diff --git a/tests/fakelowp/test_batchmatmul_nnpi_fp16.py b/tests/fakelowp/test_batchmatmul_nnpi_fp16.py deleted file mode 100644 index fd8b7f07d6..0000000000 --- a/tests/fakelowp/test_batchmatmul_nnpi_fp16.py +++ /dev/null @@ -1,120 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np -import glow.fb.test.init_shared_libs # noqa - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net -from glow.fb.test.test_utils import print_test_debug_info -from hypothesis import given -import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"]) - -GLOW_MATMUL_RTOL = 1e-3 - - -class TestBatchMatMul(serial.SerializedTestCase): - # @settings(max_examples=30) - @given( - #C=0, #st.integers(min_value=0, max_value=3), # number of batch dims - M=st.integers(min_value=1, max_value=10), - K=st.integers(min_value=1, max_value=10), - N=st.integers(min_value=1, max_value=10), - trans_a=st.booleans(), - trans_b=st.booleans(), - run_ints=st.booleans(), - **hu.gcs - ) - def test_batch_matmul(self, M, K, N, trans_a, trans_b, run_ints, gc, dc): - workspace.ResetWorkspace() - C = 0 # TODO - batch_dims = np.random.randint( - low=1, - high=3, - size=C, - dtype=np.int64).tolist() - - if run_ints: - X = np.random.randint(low=1, high=3, size=((1, M, K))).astype(np.float32) - else: - X = 100 * (np.random.rand(*(batch_dims + [M, K])).astype(np.float32) - 0.5) - if trans_a: - X = X.swapaxes(-1, -2) - - if run_ints: - Y = np.random.randint(low=1, high=3, size=((1, K, N))).astype(np.float32) - else: - Y = 100 * (np.random.rand(*(batch_dims + [K, N])).astype(np.float32) - 0.5) - if trans_b: - Y = Y.swapaxes(-1, -2) - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend(["X", "Y"]) - pred_net.external_output.append("out") - pred_net.op.add().CopyFrom( - core.CreateOperator( - 'BatchMatMul', ['X', 'Y'], 'out', trans_a=trans_a, trans_b=trans_b - ) - ) - - pred_net_ref = core.Net("pred_net_ref") - pred_net_ref.BatchMatMulFP16Acc16Fake( - ["X", "Y"], ['out'], trans_a=trans_a, trans_b=trans_b) - - print("dims", batch_dims, X.shape, Y.shape) - pred_net_onnxified = onnxifi_caffe2_net(pred_net, - {"X": X.shape, "Y": Y.shape}, - debug=True, - adjust_batch=False, - use_onnx=False) - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) - np.testing.assert_equal(num_onnxified_ops, 1) - - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.CreateNet(pred_net_onnxified) - workspace.CreateNet(pred_net_ref) - - # Run Glow net - workspace.RunNet(pred_net_onnxified.name) - out_glow = workspace.FetchBlob('out') - - # Run caffe2 net - workspace.RunNet(pred_net_ref) - out_c2_fakefp16 = workspace.FetchBlob('out') - - diff = np.abs((out_c2_fakefp16 - out_glow) / (out_c2_fakefp16 + 1e-8)) - rowdiff = np.max(diff, axis=1) - - success = True - if run_ints: - if not np.allclose(out_glow, out_c2_fakefp16): - success = False - else: - n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL]) - # Find the max difference per row, if more than 10% of the rows - # are bigger, consider it a failure. - if n_offenders * 10 > rowdiff.shape[0]: - success = False - - if not success: - print_test_debug_info("bmm", - {"m": M, "k": K, "n": N, "X": X, "Y": Y, - "out_glow": out_glow, - "out_c2_fakefp16": out_c2_fakefp16, - "diff": diff}) - assert(0) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/tests/fakelowp/test_batchnorm_nnpi_fp16.py b/tests/fakelowp/test_batchnorm_nnpi_fp16.py deleted file mode 100644 index 1d45c2c465..0000000000 --- a/tests/fakelowp/test_batchnorm_nnpi_fp16.py +++ /dev/null @@ -1,142 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np - -import glow.fb.test.init_shared_libs # noqa -import time -from caffe2.proto import caffe2_pb2 -from caffe2.python import core -from caffe2.python import workspace -from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net -from caffe2.python.onnx.tests.test_utils import TestCase -from glow.fb.test.test_utils import print_test_debug_info - -core.GlobalInit(["caffe2", "--glow_global_fp16=1", - "--glow_global_fused_scale_offset_fp16=1", - "--glow_global_force_sls_fp16_accum=1"]) - -GLOW_LOWERED_BATCHNORM = False - - -def reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order): - X = X.astype(np.float16) - scale = scale.astype(np.float16) - bias = bias.astype(np.float16) - mean = mean.astype(np.float16) - #var = var.astype(np.float16) - assert(order == "NCHW") - - scale = scale[np.newaxis, :, np.newaxis, np.newaxis] - bias = bias[np.newaxis, :, np.newaxis, np.newaxis] - mean = mean[np.newaxis, :, np.newaxis, np.newaxis] - var = var[np.newaxis, :, np.newaxis, np.newaxis] - Y = ((X - mean) * (scale / np.sqrt(var + epsilon).astype(np.float16))) + bias - return Y.astype(np.float32) - - -# Test the lowered BN op -class BatchnormTest(TestCase): - # TODO: replace with hypothesis - def test_bn(self): - size = 30 - input_channels = 20 - batch_size = 40 - seed = int(time.time()) - np.random.seed(seed) - - order = "NCHW" - epsilon = 1e-3 - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend(["X", "scale", "bias", "mean", "var"]) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var"], - ["Y"], - order=order, - is_test=True, - epsilon=epsilon - ) - ) - - if GLOW_LOWERED_BATCHNORM: - refopname = "SpatialBNFakeLoweredFp16NNPI" - else: - refopname = "SpatialBNFakeFp16NNPI" - - pred_net_ref = caffe2_pb2.NetDef() - pred_net_ref.name = "pred" - pred_net_ref.external_input.extend(["X", "scale", "bias", "mean", "var"]) - pred_net_ref.external_output.append("X") - pred_net_ref.op.add().CopyFrom( - core.CreateOperator( - refopname, - ["X", "scale", "bias", "mean", "var"], - ["Y"], - order=order, - is_test=True, - epsilon=epsilon - ) - ) - - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - - workspace.FeedBlob("scale", scale) - workspace.FeedBlob("bias", bias) - workspace.FeedBlob("mean", mean) - workspace.FeedBlob("var", var) - - # Use for reference to debug - # Y_np = reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order) - - pred_net_onnxified = onnxifi_caffe2_net( - pred_net, - {"X": [batch_size, input_channels, size, size], - "scale": [input_channels], - "bias": [input_channels], - "mean": [input_channels], - "var": [input_channels]}, - debug=True, - adjust_batch=False, - use_onnx=False - ) - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) - np.testing.assert_equal(num_onnxified_ops, 1) - - workspace.FeedBlob("X", X) - - workspace.CreateNet(pred_net_onnxified) - workspace.CreateNet(pred_net_ref) - - workspace.RunNet(pred_net_ref.name) - Y_c2 = workspace.FetchBlob("Y") - - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob("Y") - - if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)): - diff = np.abs(Y_glow - Y_c2).astype(np.float16) - print_test_debug_info( - "bn", - {"seed": seed, - "scale": scale, - "bias": bias, - "mean": mean, - "var": var, - "Y_np": Y_c2.shape, - "Y_glow": Y_glow.shape, - "diff": diff, - "rowwise_diff": np.max(np.abs(diff), -1)}) - assert(0) diff --git a/tests/fakelowp/test_fc_nnpi_fp16.py b/tests/fakelowp/test_fc_nnpi_fp16.py deleted file mode 100644 index 456834c433..0000000000 --- a/tests/fakelowp/test_fc_nnpi_fp16.py +++ /dev/null @@ -1,324 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np - -import glow.fb.test.init_shared_libs # noqa -import time -from caffe2.proto import caffe2_pb2 -from caffe2.python import core -from caffe2.python import workspace -from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net -from caffe2.python.onnx.tests.test_utils import TestCase -from glow.fb.test.test_utils import print_test_debug_info - -core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"]) - -GLOW_MATMUL_RTOL = 0 - - -class FCTest(TestCase): - def test_clip(self): - m, n, k = 8, 8, 8 - dtype = np.float32 - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend(["X", "W0", "b0", "W1", "b1"]) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "FC", - ["X", "W0", "b0"], - ["X1"], - ) - ) - pred_net.op.add().CopyFrom( - core.CreateOperator( - "FC", - ["X1", "W1", "b1"], - ["Y"], - ) - ) - workspace.GlobalInit( - ['caffe2', '--caffe2_log_level=0', '--glow_global_fp16=1', - '--glow_clip_fp16']) - workspace.SwitchWorkspace("glow_test_ws", True) - workspace.ResetWorkspace() - W0 = np.full((n, k), 65536.0, dtype) - b0 = np.random.randint(low=1, high=3, size=(n)).astype(dtype) - W1 = np.random.randint(low=1, high=3, size=(n, k)).astype(dtype) - b1 = np.random.randint(low=1, high=3, size=(n)).astype(dtype) - workspace.FeedBlob("W0", W0) - workspace.FeedBlob("b0", b0) - workspace.FeedBlob("W1", W1) - workspace.FeedBlob("b1", b1) - - pred_net_onnxified = onnxifi_caffe2_net( - pred_net, - {"X": (m, k)}, - debug=True, - adjust_batch=False, - use_onnx=False - ) - - X = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype) - workspace.FeedBlob("X", X) - workspace.CreateNet(pred_net_onnxified) - - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob("Y") - np.testing.assert_allclose(Y_glow, np.full((m, n), 65504.0, dtype)) - - def test_fc_exercise(self): - """ Test that the matmul engine is working, this doesn't test - precision - """ - m = np.random.randint(low=4, high=50) - k = np.random.randint(low=4, high=50) - n = np.random.randint(low=4, high=50) - dtype = np.float32 - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend(["X", "W0", "b0"]) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "FC", - ["X", "W0", "b0"], - ["Y"], - ) - ) - - workspace.SwitchWorkspace("glow_test_ws", True) - workspace.ResetWorkspace() - W0 = np.random.randint(low=1, high=3, size=(n, k)).astype(dtype) - b0 = np.random.randint(low=1, high=3, size=(n)).astype(dtype) - workspace.FeedBlob("W0", W0) - workspace.FeedBlob("b0", b0) - - pred_net_onnxified = onnxifi_caffe2_net(pred_net, - {"X": (m, k)}, - debug=True, - adjust_batch=False, - use_onnx=False) - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) - np.testing.assert_equal(num_onnxified_ops, 1) - - X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype) - workspace.FeedBlob("X", X0) - workspace.CreateNet(pred_net_onnxified) - workspace.CreateNet(pred_net) - - num_iterations = 2 - for _ in range(num_iterations): - X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype) - workspace.FeedBlob("X", X0) - # Run Glow net - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob('Y') - # Run caffe2 net - workspace.RunNet(pred_net.name) - Y_c2 = workspace.FetchBlob('Y') - if not np.allclose(Y_c2, Y_glow): - print_test_debug_info("fc", - {"m": m, "k": k, "n": n, "X": X0, "W0": W0, "b0": b0, - "Y_glow": Y_glow, "Y_c2": Y_c2, - "diff": np.abs((Y_c2 - Y_glow) / Y_c2)}) - assert(0) - - def test_fc_numeric_cases(self): - """ Test numerics, use examples found from the unit test. - Use Fp16FCAcc16NNPI as a reference. - """ - m = 1 - k = 20 - n = 1 - dtype = np.float32 - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend(["X", "W0", "b0"]) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "FC", - ["X", "W0", "b0"], - ["Y"], - ) - ) - pred_net_ref = caffe2_pb2.NetDef() - pred_net_ref.name = "pred" - pred_net_ref.external_input.extend(["X", "W0", "b0"]) - pred_net_ref.external_output.append("Y") - pred_net_ref.op.add().CopyFrom( - core.CreateOperator( - "Fp16FCAcc16NNPI", - ["X", "W0", "b0"], - ["Y"], - ) - ) - - workspace.SwitchWorkspace("glow_test_ws", True) - workspace.ResetWorkspace() - - W0 = np.array([[0.04882812, 0.21520996, 0.1027832, 0.04489136, - -0.07635498, 0.14587402, - -0.06240845, 0.3918457, 0.46362305, -0.11657715, - 0.29174805, 0.02890015, - 0.0680542, 0.4255371, -0.42895508, -0.4128418, - -0.47973633, 0.33251953, - 0.27807617, 0.3701172]], dtype=np.float32) - b0 = [0.47851562] - b0 = np.array(b0, dtype=np.float32) - - workspace.FeedBlob("W0", W0) - workspace.FeedBlob("b0", b0) - - pred_net_onnxified = onnxifi_caffe2_net(pred_net, - {"X": (m, k)}, - debug=True, - adjust_batch=False, - use_onnx=False) - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) - np.testing.assert_equal(num_onnxified_ops, 1) - - X0 = np.random.rand(m, k).astype(dtype) - 0.5 - workspace.FeedBlob("X", X0) - workspace.CreateNet(pred_net_onnxified) - workspace.CreateNet(pred_net_ref) - - X_inputs = [ - np.array([[-2.94921875e-01, -3.58642578e-01, -1.92871094e-01, - 2.81250000e-01, - -1.30126953e-01, 2.32696533e-02, -4.55566406e-01, - -2.31811523e-01, - -1.95190430e-01, -7.76977539e-02, -1.29394531e-01, - 2.94677734e-01, - 8.96453857e-04, 4.97314453e-01, -6.07604980e-02, - 2.55371094e-01, - 3.49853516e-01, -1.37695312e-01, 2.95410156e-01, - -3.67187500e-01]], dtype=np.float32), - np.array([[-0.4494629, -0.22192383, -0.1640625, 0.11480713, - -0.09851074, -0.02084351, - 0.19091797, -0.17468262, -0.47485352, 0.07489014, - 0.03897095, 0.00197601, - 0.02835083, -0.27294922, 0.26757812, -0.20996094, - -0.31103516, -0.41601562, - 0.09918213, -0.07696533]], dtype=np.float32), - np.array([[0.01150513, -0.20507812, 0.46704102, 0.00906372, - 0.19848633, 0.3720703, - 0.46557617, -0.47436523, -0.35107422, -0.0362854, - -0.20812988, 0.41918945, - 0.09716797, 0.19897461, 0.3876953, -0.0165863, - 0.23535156, 0.29956055, - 0.24389648, -0.23486328]], dtype=np.float32) - ] - - for i in range(len(X_inputs)): - workspace.FeedBlob("X", X_inputs[i]) - # Run Glow net - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob('Y') - workspace.RunNet(pred_net_ref.name) - Y_c2 = workspace.FetchBlob('Y') - - diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8)) - rowdiff = np.max(diff, axis=1) - - n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL]) - if n_offenders > 0: - print_test_debug_info("fc", - {"iter": i, "m": m, "k": k, "n": n, "X": X0, "W0": W0, "b0": b0, - "Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff, - "rowdiff": rowdiff}) - assert(0) - - def test_fc_num0(self): - """ Test numerics, fix a dimension and determine the ranges of error. - Use Fp16FCAcc16 as a reference. - """ - np.random.seed(int(time.time())) - m = np.random.randint(low=4, high=50) - k = np.random.randint(low=4, high=1000) - n = np.random.randint(low=4, high=50) - use_packed = np.random.randint(2) - W = "W_packed" if use_packed else "W0" - dtype = np.float32 - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend(["X", W, "b0"]) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "FbFCPacked" if use_packed else "FC", - ["X", W, "b0"], - ["Y"], - ) - ) - pred_net_ref = caffe2_pb2.NetDef() - pred_net_ref.name = "pred" - pred_net_ref.external_input.extend(["X", W, "b0"]) - pred_net_ref.external_output.append("Y") - pred_net_ref.op.add().CopyFrom( - core.CreateOperator( - "Fp16FCAcc16NNPI", - ["X", W, "b0"], - ["Y"], - ) - ) - - workspace.SwitchWorkspace("glow_test_ws", True) - workspace.ResetWorkspace() - W0 = 10 * (np.random.rand(n, k) - 0.5).astype(np.float16).astype(np.float32) - b0 = 1 * (np.random.rand(n) - 0.5).astype(np.float16).astype(np.float32) - - workspace.FeedBlob("W0", W0) - workspace.FeedBlob("b0", b0) - workspace.RunOperatorOnce( - core.CreateOperator( - "FbGemmPack", - ['W0'], - ['W_packed'], - no_packing=True, - ) - ) - - pred_net_onnxified = onnxifi_caffe2_net(pred_net, - {"X": (m, k)}, - debug=True, - adjust_batch=False, - use_onnx=False) - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) - np.testing.assert_equal(num_onnxified_ops, 1) - - X0 = np.random.rand(m, k).astype(dtype) - 0.5 - workspace.FeedBlob("X", X0) - workspace.CreateNet(pred_net_onnxified) - workspace.CreateNet(pred_net_ref) - - num_iterations = 10 - for _ in range(num_iterations): - X0 = 100 * (np.random.rand(m, k) - 0.5).\ - astype(np.float16).astype(np.float32) - workspace.FeedBlob("X", X0) - # Run Glow net - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob('Y') - # Run caffe2 net - workspace.RunNet(pred_net_ref.name) - Y_c2 = workspace.FetchBlob('Y') - - diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8)) - rowdiff = np.max(diff, axis=1) - - n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL]) - if n_offenders > 0: - print_test_debug_info("fc", - {"iter": _, "m": m, "k": k, "n": n, "X": X0, "W0": W0, - "b0": b0, "Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff, - "rowdiff": rowdiff}) - assert(0) diff --git a/tests/fakelowp/test_op_nnpi_fp16.py b/tests/fakelowp/test_op_nnpi_fp16.py deleted file mode 100644 index c5c4509e8f..0000000000 --- a/tests/fakelowp/test_op_nnpi_fp16.py +++ /dev/null @@ -1,235 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import ctypes -import numpy as np -import os - -import glow.fb.test.init_shared_libs # noqa - -import caffe2.python.hypothesis_test_util as hu -from hypothesis import given - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import dyndep -from caffe2.python import core -from caffe2.python import workspace -from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net -from caffe2.python.onnx.tests.test_utils import TestCase -from glow.fb.test.test_utils import print_test_debug_info - -core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"]) - -kEpsilon = 1e-8 - - -class ArithmeticOpsTest(TestCase): - def _test_binary_op_graph(self, name): - # First dimension is the batch size - dims = np.concatenate((np.array([1]), np.random.randint(1, 20, size=3))) - A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32) - B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32) - print(A.shape, B.shape) - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend(["A", "B"]) - pred_net.external_output.append("C") - pred_net.op.add().CopyFrom( - core.CreateOperator( - name, - ["A", "B"], - ["C"] - ) - ) - pred_net_ref = caffe2_pb2.NetDef() - pred_net_ref.name = "ref" - pred_net_ref.external_input.extend(["A", "B"]) - pred_net_ref.external_output.append("C_ref") - pred_net_ref.op.add().CopyFrom( - core.CreateOperator( - name + "FakeFp16", - ["A", "B"], - ["C_ref"], - ) - ) - - shape_hints = {"A": A.shape, "B": B.shape} - pred_net_onnxified = onnxifi_caffe2_net(pred_net, - shape_hints, - debug=True, - adjust_batch=True, - use_onnx=False) - print(pred_net_onnxified) - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) - np.testing.assert_equal(num_onnxified_ops, 1) - workspace.SwitchWorkspace("glow_test_ws", True) - workspace.FeedBlob("A", A) - workspace.FeedBlob("B", B) - - workspace.CreateNet(pred_net_ref) - workspace.CreateNet(pred_net_onnxified) - num_iterations = 10 - for _ in range(num_iterations): - A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32) - B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32) - workspace.FeedBlob("A", A) - workspace.FeedBlob("B", B) - # Run caffe2 net - workspace.RunNet(pred_net_ref.name) - Y_c2 = workspace.FetchBlob("C_ref") - - # Run Glow net - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob("C") - - # Results should be identical since we are comparing with the C2 emulation - if not np.allclose(Y_c2, Y_glow): - diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon)) - print_test_debug_info(name, { - "dims": dims, "A": A, "B": B, - "Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff}) - assert(0) - - def test_add_graph(self): - self._test_binary_op_graph("Add") - - def test_sub_graph(self): - self._test_binary_op_graph("Sub") - - def test_mul_graph(self): - self._test_binary_op_graph("Mul") - - def test_div_graph(self): - self._test_binary_op_graph("Div") - - -class UnaryOpTest(TestCase): - def _test_unary_op(self, opname): - workspace.ResetWorkspace() - n = 1 - m = 10000 - X = np.linspace(-20, 20, num=m, dtype=np.float32) - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.append("X") - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - opname, - ['X'], - ['Y']) - ) - ref_net = caffe2_pb2.NetDef() - ref_net.name = "ref" - ref_net.external_input.append("X") - ref_net.external_output.append("Y") - ref_net.op.add().CopyFrom( - core.CreateOperator( - opname + 'FakeFp16NNPI', - ['X'], - ['Y']) - ) - - shape_hints = {"X": (n, m)} - pred_net_onnxified = onnxifi_caffe2_net(pred_net, - shape_hints, - debug=True, - adjust_batch=False, - use_onnx=False) - print(pred_net_onnxified) - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) - np.testing.assert_equal(num_onnxified_ops, 1) - workspace.SwitchWorkspace("glow_test_ws", True) - workspace.FeedBlob("X", X) - workspace.CreateNet(ref_net) - workspace.CreateNet(pred_net_onnxified) - # Run Glow net - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob('Y') - # Run caffe2 reference net - workspace.RunNet(ref_net.name) - Y_c2 = workspace.FetchBlob('Y') - - if not np.allclose(Y_c2, Y_glow): - diff = np.abs(Y_c2 - Y_glow) - np.save('/tmp/' + opname + 'diff', diff) - print_test_debug_info(opname, - {"X": X, - "Y_c2": Y_c2, - "Y_glow": Y_glow, - "diff": diff, - "maxdiff": np.max(diff)}) - assert(0) - - def test_sigmoid(self): - self._test_unary_op("Sigmoid") - - def test_tanh(self): - self._test_unary_op("Tanh") - - -class ReluTest(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=1, min_dim=1, max_dim=3, dtype=np.float32)) - def relu_test(self, inputs, gc, dc): - X = inputs[0] - # First dimension is the batch size - print(X.shape) - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend(["X"]) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "Relu", - ["X"], - ["Y"] - ) - ) - pred_net_ref = caffe2_pb2.NetDef() - pred_net_ref.name = "ref" - pred_net_ref.external_input.extend(["X"]) - pred_net_ref.external_output.append("Y_ref") - pred_net_ref.op.add().CopyFrom( - core.CreateOperator( - "ReluFakeFp16", - ["X"], - ["Y_ref"], - ) - ) - - shape_hints = {"X": X.shape} - pred_net_onnxified = onnxifi_caffe2_net(pred_net, - shape_hints, - debug=True, - adjust_batch=True, - use_onnx=False) - print(pred_net_onnxified) - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) - np.testing.assert_equal(num_onnxified_ops, 1) - workspace.SwitchWorkspace("glow_test_ws", True) - workspace.FeedBlob("X", X) - - workspace.CreateNet(pred_net_ref) - workspace.CreateNet(pred_net_onnxified) - workspace.FeedBlob("X", X) - # Run caffe2 net - workspace.RunNet(pred_net_ref.name) - Y_c2 = workspace.FetchBlob("Y_ref") - - # Run Glow net - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob("Y") - - # Results should be identical since we are comparing with the C2 emulation - if not np.allclose(Y_c2, Y_glow): - diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon)) - print_test_debug_info("Relu", { - "X": X, - "Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff}) - assert(0) diff --git a/tests/fakelowp/test_sls_4bit_nnpi_fp16.py b/tests/fakelowp/test_sls_4bit_nnpi_fp16.py deleted file mode 100644 index 4a3689eaf7..0000000000 --- a/tests/fakelowp/test_sls_4bit_nnpi_fp16.py +++ /dev/null @@ -1,219 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np -import time - -# Must happen before importing caffe2.python.* -import glow.fb.test.init_shared_libs # noqa - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, dyndep -from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net -from caffe2.python.onnx.tests.test_utils import TestCase -from glow.fb.test.test_utils import print_test_debug_info - -workspace.GlobalInit(["caffe2", "--glow_global_fp16=1", - "--glow_global_fused_scale_offset_fp16=1", - "--glow_global_force_sls_fp16_accum=1"]) - - -class SparseLengthsSumTest(TestCase): - def test_slws_fused_4bit_rowwise_all_same(self): - # Comment out for predictable debugging - seed = int(time.time()) - np.random.seed(seed) - workspace.ResetWorkspace() - n = 1 - m = 2 - data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1 - - max_segments = 5 - max_segment_length = 100 - num_lengths = np.random.randint(1, max_segments + 1) - # number of segments to run - lengths = np.random.randint(0, max_segment_length + 1, - size=num_lengths).astype(np.int32) - num_indices = np.sum(lengths) - indices = np.zeros(num_indices, dtype=np.int64) - weights = np.random.uniform(low=-0.5, high=0.5, - size=[len(indices)]).astype(np.float32) - weights = np.ones(len(indices)).astype(np.float32) - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"]) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused4BitRowwise", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - ref_net = caffe2_pb2.NetDef() - ref_net.name = "ref" - ref_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"]) - ref_net.external_output.append("Y") - ref_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - workspace.FeedBlob("data", data) - workspace.RunOperatorOnce( - core.CreateOperator( - "FloatToFused4BitRowwiseQuantized", - ['data'], - ['quantized_data'] - ) - ) - - print("quantized", workspace.FetchBlob("quantized_data")) - pred_net_onnxified = onnxifi_caffe2_net( - pred_net, - {}, - max_batch_size=max_segments, - max_seq_size=max_segments * max_segment_length, - debug=True, - adjust_batch=True, - use_onnx=False - ) - - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) - np.testing.assert_equal(num_onnxified_ops, 1) - - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - workspace.FeedBlob("weights", weights) - - workspace.CreateNet(pred_net_onnxified) - workspace.CreateNet(ref_net) - - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob('Y') - - workspace.RunNet(ref_net.name) - Y_c2 = workspace.FetchBlob('Y') - - if not np.allclose(Y_c2, Y_glow): - print_test_debug_info( - "slws_fused_4bit_rowwise", - {"seed": seed, - "indices": indices, - "data": data, - "lengths": lengths, - "weights": weights, - "Y_c2": Y_c2, - "Y_glow": Y_glow, - "diff": Y_glow - Y_c2, - "rowwise_diff": (Y_glow - Y_c2)[:, 0]}) - assert(0) - - def test_slws_fused_4bit_rowwise(self): - # Comment out for predictable debugging - seed = int(time.time() * 1000) % 2 ** 16 - print(seed) - np.random.seed(seed) - workspace.ResetWorkspace() - - n = 20000 - DIM = 6 - data = (4 * np.random.random_sample((n, DIM)) + 1).astype(np.float32) - - max_segments = 200 - max_segment_length = 200 - num_lengths = np.random.randint(0, max_segments + 1) - # number of segments to run - lengths = np.random.randint(2, max_segment_length + 1, size=num_lengths).astype( - np.int32 - ) - num_indices = np.sum(lengths) - indices = np.random.randint(low=0, high=n, size=num_indices, dtype=np.int64) - weights = np.random.uniform(low=0.01, high=0.5, size=[len(indices)]).astype( - np.float32 - ) - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused4BitRowwise", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - ref_net = caffe2_pb2.NetDef() - ref_net.name = "ref" - ref_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - ref_net.external_output.append("Y") - ref_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - workspace.FeedBlob("data", data) - workspace.RunOperatorOnce( - core.CreateOperator( - "FloatToFused4BitRowwiseQuantized", ["data"], ["quantized_data"] - ) - ) - onnxified_net = onnxifi_caffe2_net( - pred_net, - {}, - max_batch_size=max_segments, - max_seq_size=max_segments * max_segment_length, - debug=True, - adjust_batch=True, - use_onnx=False, - ) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - workspace.FeedBlob("weights", weights) - - workspace.CreateNet(onnxified_net) - workspace.CreateNet(ref_net) - - workspace.RunNet(onnxified_net.name) - Y_glow = workspace.FetchBlob("Y") - - workspace.RunNet(ref_net.name) - Y_ref = workspace.FetchBlob("Y") - - diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8)) - max_err = np.max(diff, axis=1) - num_offenders = (max_err > 0).sum() - if num_offenders > 0: - print_test_debug_info( - "slws_fused_4bit", - { - "indices": indices, - "data": data.shape, - "lengths": lengths, - "weights": weights, - "Y_glow": Y_glow, - "Y_ref": Y_ref, - "diff": diff, - "rowwise_diff": np.max(diff, axis=1), - }, - ) - assert 0 diff --git a/tests/fakelowp/test_sls_nnpi_fp16.py b/tests/fakelowp/test_sls_nnpi_fp16.py deleted file mode 100644 index 7e09610622..0000000000 --- a/tests/fakelowp/test_sls_nnpi_fp16.py +++ /dev/null @@ -1,616 +0,0 @@ -from __future__ import absolute_import, division, print_function, unicode_literals - -import time - -# Must happen before importing caffe2.python.* -import glow.fb.test.init_shared_libs # noqa -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net -from caffe2.python.onnx.tests.test_utils import TestCase -from glow.fb.test.test_utils import print_test_debug_info - - -workspace.GlobalInit( - [ - "caffe2", - "--glow_global_fp16=1", - "--glow_global_fused_scale_offset_fp16=1", - "--glow_global_force_sls_fp16_accum=1", - ] -) -GLOW_MATMUL_ATOL = 1e-5 -GLOW_MATMUL_RTOL = 1e-3 - - -class SparseLengthsSumTest(TestCase): - def Test_SLS_NonQuantized_fp16(self): - N = 20000 - DIM = 64 - D = (4 * np.random.random_sample((N, DIM)) + 1).astype(np.float32) - I = (np.random.randint(0, N, size=12)).astype(np.int64) - L = np.asarray([4, 4, 4]).astype(np.int32) - workspace.FeedBlob("D", D) - - ref_c2_net = core.Net("test_ref_c2") - ref_c2_net.SparseLengthsSum(["D", "I", "L"], "ref_out") - ref_c2_net.Proto().external_input.extend(["D", "I", "L"]) - ref_c2_net.Proto().external_output.extend(["ref_out"]) - - fp16_c2_net = core.Net("test_fp16_c2") - fp16_c2_net.SparseLengthsSumFakeFP16AccFP16(["D", "I", "L"], "fp16_out") - - input_dict = {} - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend(["D", "I", "L"]) - pred_net.external_output.append("glow_out") - pred_net.op.add().CopyFrom( - core.CreateOperator("SparseLengthsSum", ["D", "I", "L"], ["glow_out"]) - ) - - onnxified_net = onnxifi_caffe2_net( - pred_net, - input_dict, - max_batch_size=3, - max_seq_size=16, - debug=True, - adjust_batch=False, - use_onnx=False, - ) - - num_onnxified_ops = sum( - 1 if op.type == "Onnxifi" else 0 for op in onnxified_net.op - ) - print(onnxified_net) - np.testing.assert_equal(num_onnxified_ops, 1) - - workspace.FeedBlob("I", I) - workspace.FeedBlob("L", L) - - workspace.RunNetOnce(ref_c2_net) - ref_c2_out = workspace.FetchBlob("ref_out") - - workspace.RunNetOnce(fp16_c2_net) - fp16_c2_out = workspace.FetchBlob("fp16_out") - - np.testing.assert_allclose(fp16_c2_out, ref_c2_out, atol=1e-3, rtol=1e-3) - - workspace.RunNetOnce(onnxified_net) - fp16_glow_out = workspace.FetchBlob("glow_out") - - if not np.allclose(fp16_glow_out, fp16_c2_out): - diff = np.abs(fp16_glow_out - fp16_c2_out) - print_test_debug_info( - "sls", - { - "indices": I, - "data": D, - "lengths": L, - "Y_c2": fp16_c2_out, - "Y_glow": fp16_glow_out, - "diff": diff, - "rowwise_diff": diff[:, 0], - }, - ) - assert 0 - - def test_slws_fused_8bit_rowwise_all_same(self): - # Comment out for predictable debugging - np.random.seed(int(time.time())) - workspace.ResetWorkspace() - n = 1 - m = 2 - data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1 - - max_segments = 5 - max_segment_length = 200 - num_lengths = np.random.randint(1, max_segments + 1) - # number of segments to run - lengths = np.random.randint(0, max_segment_length + 1, size=num_lengths).astype( - np.int32 - ) - num_indices = np.sum(lengths) - indices = np.zeros(num_indices, dtype=np.int64) - weights = np.random.uniform(low=-0.5, high=0.5, size=[len(indices)]).astype( - np.float32 - ) - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwise", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - ref_net = caffe2_pb2.NetDef() - ref_net.name = "ref" - ref_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - ref_net.external_output.append("Y") - ref_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - workspace.FeedBlob("data", data) - workspace.RunOperatorOnce( - core.CreateOperator( - "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"] - ) - ) - pred_net_onnxified = onnxifi_caffe2_net( - pred_net, - {}, - max_batch_size=max_segments, - max_seq_size=max_segments * max_segment_length, - debug=True, - adjust_batch=True, - use_onnx=False, - ) - - num_onnxified_ops = sum( - 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op - ) - np.testing.assert_equal(num_onnxified_ops, 1) - - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - workspace.FeedBlob("weights", weights) - - workspace.CreateNet(pred_net_onnxified) - workspace.CreateNet(ref_net) - - workspace.RunNet(pred_net_onnxified.name) - Y_glow = workspace.FetchBlob("Y") - - workspace.RunNet(ref_net.name) - Y_c2 = workspace.FetchBlob("Y") - - if not np.allclose(Y_c2, Y_glow): - print_test_debug_info( - "slws_fused_8bit_rowwise", - { - "indices": indices, - "data": data, - "lengths": lengths, - "weights": weights, - "Y_c2": Y_c2, - "Y_glow": Y_glow, - "diff": Y_glow - Y_c2, - "rowwise_diff": (Y_glow - Y_c2)[:, 0], - }, - ) - assert 0 - - def test_slws_fused_8bit_rowwise_turkey(self): - # Comment out for predictable debugging - seed = int(time.time() * 1000) % 2 ** 16 - print(seed) - np.random.seed(seed) - workspace.ResetWorkspace() - - n = 20000 - DIM = 6 - data = (4 * np.random.random_sample((n, DIM)) + 1).astype(np.float32) - - max_segments = 200 - max_segment_length = 200 - num_lengths = np.random.randint(0, max_segments + 1) - # number of segments to run - lengths = np.random.randint(2, max_segment_length + 1, size=num_lengths).astype( - np.int32 - ) - num_indices = np.sum(lengths) - indices = np.random.randint(low=0, high=n, size=num_indices, dtype=np.int64) - weights = np.random.uniform(low=0.01, high=0.5, size=[len(indices)]).astype( - np.float32 - ) - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwise", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - ref_net = caffe2_pb2.NetDef() - ref_net.name = "ref" - ref_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - ref_net.external_output.append("Y") - ref_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - workspace.FeedBlob("data", data) - workspace.RunOperatorOnce( - core.CreateOperator( - "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"] - ) - ) - onnxified_net = onnxifi_caffe2_net( - pred_net, - {}, - max_batch_size=max_segments, - max_seq_size=max_segments * max_segment_length, - debug=True, - adjust_batch=True, - use_onnx=False, - ) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - workspace.FeedBlob("weights", weights) - - workspace.CreateNet(onnxified_net) - workspace.CreateNet(ref_net) - - workspace.RunNet(onnxified_net.name) - Y_glow = workspace.FetchBlob("Y") - - workspace.RunNet(ref_net.name) - Y_ref = workspace.FetchBlob("Y") - - diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8)) - max_err = np.max(diff, axis=1) - num_offenders = (max_err > 0).sum() - if num_offenders > 0: - print_test_debug_info( - "slws_fused_8bit_rowwise_inv_scale", - { - "indices": indices, - "data": data.shape, - "lengths": lengths, - "weights": weights, - "Y_glow": Y_glow, - "Y_ref": Y_ref, - "diff": diff, - "rowwise_diff": np.max(diff, axis=1), - }, - ) - assert 0 - - # Simple test to aid debugging order of operations - # Minimize the case to an SLS that adds two rows - def test_small_sls(self): - seed = int(time.time() * 1000) % 2 ** 16 - print(seed) - np.random.seed(seed) - workspace.ResetWorkspace() - - n = 2 - DIM = 3 - data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32) - - lengths = np.array([n], dtype=np.int32) - indices = np.array(range(n), dtype=np.int64) - weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32) - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwise", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - ref_net = caffe2_pb2.NetDef() - ref_net.name = "ref" - ref_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - ref_net.external_output.append("Y") - ref_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - workspace.FeedBlob("data", data) - workspace.RunOperatorOnce( - core.CreateOperator( - "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"] - ) - ) - - quantized_data = workspace.FetchBlob("quantized_data") - - onnxified_net = onnxifi_caffe2_net( - pred_net, - {}, - max_batch_size=1, - max_seq_size=n, - debug=True, - adjust_batch=True, - use_onnx=False, - ) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - workspace.FeedBlob("weights", weights) - - workspace.CreateNet(onnxified_net) - workspace.CreateNet(ref_net) - - workspace.RunNet(onnxified_net.name) - Y_glow = workspace.FetchBlob("Y") - - workspace.RunNet(ref_net.name) - Y_ref = workspace.FetchBlob("Y") - - diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8)) - max_err = np.max(diff, axis=1) - num_offenders = (max_err > 0).sum() - if num_offenders > 0: - np.set_printoptions(precision=12) - print( - "ref", - Y_ref.astype(np.float16).astype(np.float32), - "glow", - Y_glow.astype(np.float16).astype(np.float32), - ) - print_test_debug_info( - "slws_fused_8bit_rowwise_inv_scale", - { - "seed": seed, - "indices": indices, - "data": data, - "quantized_data": quantized_data, - "lengths": lengths, - "weights": weights, - "Y_glow": Y_glow, - "Y_ref": Y_ref, - "diff": diff, - "rowwise_diff": np.max(diff, axis=1), - }, - ) - assert 0 - - def test_small_sls_acc32(self): - - workspace.GlobalInit( - [ - "caffe2", - "--glow_global_fp16=0", - "--glow_global_fused_scale_offset_fp16=0", - "--glow_global_force_sls_fp16_accum=0", - ] - ) - seed = int(time.time() * 1000) % 2 ** 16 - print(seed) - np.random.seed(seed) - workspace.ResetWorkspace() - - n = 2 - DIM = 3 - data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32) - - lengths = np.array([n], dtype=np.int32) - indices = np.array(range(n), dtype=np.int64) - weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32) - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwise", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - ref_net = caffe2_pb2.NetDef() - ref_net.name = "ref" - ref_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - ref_net.external_output.append("Y") - ref_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - workspace.FeedBlob("data", data) - workspace.RunOperatorOnce( - core.CreateOperator( - "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"] - ) - ) - - quantized_data = workspace.FetchBlob("quantized_data") - - onnxified_net = onnxifi_caffe2_net( - pred_net, - {}, - max_batch_size=1, - max_seq_size=n, - debug=True, - adjust_batch=True, - use_onnx=False, - ) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - workspace.FeedBlob("weights", weights) - - workspace.CreateNet(onnxified_net) - workspace.CreateNet(ref_net) - - workspace.RunNet(onnxified_net.name) - Y_glow = workspace.FetchBlob("Y") - - workspace.RunNet(ref_net.name) - Y_ref = workspace.FetchBlob("Y") - - diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8)) - max_err = np.max(diff, axis=1) - num_offenders = (max_err > 0).sum() - if num_offenders > 0: - np.set_printoptions(precision=12) - print( - "ref", - Y_ref.astype(np.float16).astype(np.float32), - "glow", - Y_glow.astype(np.float16).astype(np.float32), - ) - print_test_debug_info( - "test_small_sls_acc32", - { - "seed": seed, - "indices": indices, - "data": data, - "quantized_data": quantized_data, - "lengths": lengths, - "weights": weights, - "Y_glow": Y_glow, - "Y_ref": Y_ref, - "diff": diff, - "rowwise_diff": np.max(diff, axis=1), - }, - ) - assert 0 - - def test_slws_fused_8bit_rowwise_acc32_nnpi(self): - workspace.GlobalInit( - [ - "caffe2", - "--glow_global_fp16=0", - "--glow_global_fused_scale_offset_fp16=0", - "--glow_global_force_sls_fp16_accum=0", - ] - ) - # Comment out for predictable debugging - seed = int(time.time() * 1000) % 2 ** 16 - print(seed) - np.random.seed(seed) - workspace.ResetWorkspace() - - n = 20000 - DIM = 6 - data = (4 * np.random.random_sample((n, DIM)) + 1).astype(np.float32) - - max_segments = 200 - max_segment_length = 200 - num_lengths = np.random.randint(0, max_segments + 1) - # number of segments to run - lengths = np.random.randint(2, max_segment_length + 1, size=num_lengths).astype( - np.int32 - ) - num_indices = np.sum(lengths) - indices = np.random.randint(low=0, high=n, size=num_indices, dtype=np.int64) - weights = np.random.uniform(low=0.01, high=0.5, size=[len(indices)]).astype( - np.float32 - ) - - pred_net = caffe2_pb2.NetDef() - pred_net.name = "pred" - pred_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - pred_net.external_output.append("Y") - pred_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwise", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - ref_net = caffe2_pb2.NetDef() - ref_net.name = "ref" - ref_net.external_input.extend( - ["quantized_data", "weights", "indices", "lengths"] - ) - ref_net.external_output.append("Y") - ref_net.op.add().CopyFrom( - core.CreateOperator( - "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI", - ["quantized_data", "weights", "indices", "lengths"], - ["Y"], - ) - ) - - workspace.FeedBlob("data", data) - workspace.RunOperatorOnce( - core.CreateOperator( - "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"] - ) - ) - onnxified_net = onnxifi_caffe2_net( - pred_net, - {}, - max_batch_size=max_segments, - max_seq_size=max_segments * max_segment_length, - debug=True, - adjust_batch=True, - use_onnx=False, - ) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - workspace.FeedBlob("weights", weights) - - workspace.CreateNet(onnxified_net) - workspace.CreateNet(ref_net) - - workspace.RunNet(onnxified_net.name) - Y_glow = workspace.FetchBlob("Y") - - workspace.RunNet(ref_net.name) - Y_ref = workspace.FetchBlob("Y") - - diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8)) - max_err = np.max(diff, axis=1) - num_offenders = (max_err > 0).sum() - if num_offenders > 0: - print_test_debug_info( - "test_slws_fused_8bit_rowwise_acc32_nnpi", - { - "indices": indices, - "data": data.shape, - "lengths": lengths, - "weights": weights, - "Y_glow": Y_glow, - "Y_ref": Y_ref, - "diff": diff, - "rowwise_diff": np.max(diff, axis=1), - }, - ) - assert 0