pytorch · kimishpatel · Jan 26, 2021 · Jan 26, 2021 · Jan 27, 2021 · Jan 29, 2021
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-sse2.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-sse2.c
@@ -11,9 +11,7 @@
 #include <qnnpack/q8gemm_sparse.h>
 #include <requantization/runtime-sse2.h>
 
-#define MR 8
-#define COL_BLOCK_SIZE 4
-#define PACKED_A_BLOCK_SIZE COL_BLOCK_SIZE*MR
+#include "8x4c1x4-packed-sse2.h"
 
 // This is a super slow kernel in that it does not use intrinsics to
 // tranpose. Since this is for x86 we are not optimizing it.

diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4c1x4-dq-packedA-sse2.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4c1x4-dq-packedA-sse2.c
@@ -11,9 +11,7 @@
 #include <qnnpack/q8gemm_sparse.h>
 #include <requantization/runtime-sse2.h>
 
-#define MR 8
-#define COL_BLOCK_SIZE 4
-#define PACKED_A_BLOCK_SIZE COL_BLOCK_SIZE*MR
+#include "8x4c1x4-packed-sse2.h"
 
 #define CONVERT_TO_FP_AND_TRANSPOSE(a, b, c, d, t_a, t_b, t_c, t_d)  \
   a_ps = _mm_cvtepi32_ps(a);                                         \

diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4c1x4-packed-sse2.h b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4c1x4-packed-sse2.h
@@ -0,0 +1,5 @@
+#pragma once
+// Tile constants shared by 8x4-packA-sse2.c and 8x4c1x4-dq-packedA-sse2.c.
+#define MR 8 /* presumably the row-tile height, the "8" in 8x4 -- TODO confirm */
+#define COL_BLOCK_SIZE 4 /* presumably columns per packed block, the "4" in 8x4 -- TODO confirm */
+#define PACKED_A_BLOCK_SIZE (COL_BLOCK_SIZE * MR) /* parenthesized so it is safe next to / and % */
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/4x4-packA-aarch32-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/4x4-packA-aarch32-neon.S
@@ -0,0 +1,3 @@
+#if defined(__arm__)
+#include <q8gemm_sparse/4x4-packA-aarch32-neon.S>
+#endif /* defined(__arm__) */
diff --git a/...Ten/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/4x8c1x4-dq-packedA-aarch32-neon.S b/...Ten/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/4x8c1x4-dq-packedA-aarch32-neon.S
@@ -0,0 +1,3 @@
+#if defined(__arm__)
+#include <q8gemm_sparse/4x8c1x4-dq-packedA-aarch32-neon.S>
+#endif /* defined(__arm__) */
diff --git a/...Ten/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/4x8c8x1-dq-packedA-aarch32-neon.S b/...Ten/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/4x8c8x1-dq-packedA-aarch32-neon.S
@@ -0,0 +1,3 @@
+#if defined(__arm__)
+#include <q8gemm_sparse/4x8c8x1-dq-packedA-aarch32-neon.S>
+#endif /* defined(__arm__) */
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/8x4-packA-aarch64-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/8x4-packA-aarch64-neon.S
@@ -0,0 +1,3 @@
+#if defined(__aarch64__)
+#include <q8gemm_sparse/8x4-packA-aarch64-neon.S>
+#endif /* defined(__aarch64__) */
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/8x4c1x4-packed-sse2.c b/aten/src/ATen/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/8x4c1x4-packed-sse2.c
@@ -0,0 +1,4 @@
+#if defined(__i386__) || defined(__i686__) || defined(__x86_64__)
+#include <q8gemm_sparse/8x4-packA-sse2.c>
+#include <q8gemm_sparse/8x4c1x4-dq-packedA-sse2.c>
+#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */
diff --git a/...Ten/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/8x8c1x4-dq-packedA-aarch64-neon.S b/...Ten/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/8x8c1x4-dq-packedA-aarch64-neon.S
@@ -0,0 +1,3 @@
+#if defined(__aarch64__)
+#include <q8gemm_sparse/8x8c1x4-dq-packedA-aarch64-neon.S>
+#endif /* defined(__aarch64__) */
diff --git a/...Ten/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/8x8c8x1-dq-packedA-aarch64-neon.S b/...Ten/native/quantized/cpu/qnnpack/wrappers/q8gemm_sparse/8x8c8x1-dq-packedA-aarch64-neon.S
@@ -0,0 +1,3 @@
+#if defined(__aarch64__)
+#include <q8gemm_sparse/8x8c8x1-dq-packedA-aarch64-neon.S>
+#endif /* defined(__aarch64__) */