diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md
index 7e00d0eca9a..dcc9840f25a 100644
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md
+++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md
@@ -1,115 +1,5 @@
# Fast Fast Hadamard Transform
-FFHT (Fast Fast Hadamard Transform) is a library that provides a heavily
-optimized C99 implementation of the Fast Hadamard Transform. FFHT also provides
-a thin Python wrapper that allows to perform the Fast Hadamard Transform on
-one-dimensional [NumPy](http://www.numpy.org/) arrays.
-
-The Hadamard Transform is a linear orthogonal map defined on real vectors whose
-length is a _power of two_. For the precise definition, see the
-[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The
-Hadamard Transform has been recently used a lot in various machine learning
-and numerical algorithms.
-
-FFHT uses [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions)
-to speed up the computation.
-
-The header file `fht.h` exports two functions: `int fht_float(float *buf, int
-log_n)` and `int fht_double(double *buf, int log_n)`. The
-only difference between them is the type of vector entries. So, in what follows,
-we describe how the version for floats `fht_float` works.
-
-The function `fht_float` takes two parameters:
-
-* `buf` is a pointer to the data on which one needs to perform the Fast
-Hadamard Transform.
-* `log_n` is the binary logarithm of the length of `buffer`.
-That is, the length is equal to `2^log_n`.
-
-The return value is -1 if the input is invalid and is zero otherwise.
-
-A header-only version of the library is provided in `fht_header_only.h`.
-
-In addition to the Fast Hadamard Transform, we provide two auxiliary programs:
-`test_float` and `test_double`, which are implemented in C99. The exhaustively
-test and benchmark the library.
-
-FFHT has been tested on 64-bit versions of Linux, OS X and Windows (the latter
-is via Cygwin).
-
-To install the Python package, run `python setup.py install`. The script
-`example.py` shows how to use FFHT from Python.
-
-## Benchmarks
-
-Below are the times for the Fast Hadamard Transform for vectors of
-various lengths. The benchmarks were run on a machine with Intel
-Core i7-6700K and 2133 MHz DDR4 RAM. We compare FFHT,
-[FFTW 3.3.6](http://fftw.org/), and
-[fht](https://github.com/nbarbey/fht) by
-[Nicolas Barbey](https://github.com/nbarbey).
-
-Let us stress that FFTW is a great versatile tool, and the authors of FFTW did
-not try to optimize the performace of the Fast Hadamard Transform. On the other
-hand, FFHT does one thing (the Fast Hadamard Transform), but does it extremely
-well.
-
-Vector size | FFHT (float) | FFHT (double) | FFTW 3.3.6 (float) | FFTW 3.3.6 (double) | fht (float) | fht (double)
-:---: | :---: | :---: | :---: | :---: | :---: | :---:
-210 | 0.31 us | 0.49 us | 4.48 us | 7.72 us | 17.4 us | 19.3 us
-220 | 0.68 ms | 1.39 ms | 8.81 ms | 17.07 ms | 29.8 ms | 35.0 ms
-227 | 0.22 s | 0.50 s | 2.08 s | 3.57 s | 6.89 s | 7.49 s
-
-## Troubleshooting
-
-For some versions of OS X the native `clang` compiler (that mimicks `gcc`) may
-not recognize the availability of AVX. A solution for this problem is to use a
-genuine `gcc` (say from [Homebrew](http://brew.sh/)) or to use `-march=corei7-avx`
-instead of `-march=native` for compiler flags.
-
-A symptom of the above happening is the undefined macros `__AVX__`.
-
-## Related Work
-
-FFHT has been created as a part of
-[FALCONN](https://github.com/falconn-lib/falconn): a library for similarity
-search over high-dimensional data. FALCONN's underlying algorithms are described
-and analyzed in the following research paper:
-
-> Alexandr Andoni, Piotr Indyk, Thijs Laarhoven, Ilya Razenshteyn and Ludwig
-> Schmidt, "Practical and Optimal LSH for Angular Distance", NIPS 2015, full
-> version available at [arXiv:1509.02897](http://arxiv.org/abs/1509.02897)
-
-This is the right paper to cite, if you use FFHT for your research projects.
-
-## Acknowledgments
-
-We thank Ruslan Savchenko for useful discussions.
-
-Thanks to:
-
-* Clement Canonne
-* Michal Forisek
-* Rati Gelashvili
-* Daniel Grier
-* Dhiraj Holden
-* Justin Holmgren
-* Aleksandar Ivanovic
-* Vladislav Isenbaev
-* Jacob Kogler
-* Ilya Kornakov
-* Anton Lapshin
-* Rio LaVigne
-* Oleg Martynov
-* Linar Mikeev
-* Cameron Musco
-* Sam Park
-* Sunoo Park
-* Amelia Perry
-* Andrew Sabisch
-* Abhishek Sarkar
-* Ruslan Savchenko
-* Vadim Semenov
-* Arman Yessenamanov
-
-for helping us with testing FFHT.
+This directory contains a fork of https://github.com/FALCONN-LIB/FFHT
+(License: https://github.com/FALCONN-LIB/FFHT/blob/master/LICENSE.md)
+focused on ARM64 NEON code generation.
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_2.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_2.c
deleted file mode 100644
index 2041e5eedec..00000000000
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_2.c
+++ /dev/null
@@ -1,128 +0,0 @@
-#include
-#include
-#include "fht.h"
-
-#define UNUSED(x) (void)(x)
-
-static char module_docstring[] =
- "A C extension that computes the Fast Hadamard Transform";
-static char fht_docstring[] =
- "Compute the Fast Hadamard Transform (FHT) for a given "
- "one-dimensional NumPy array.\n\n"
- "The Hadamard Transform is a linear orthogonal map defined on real vectors "
- "whose length is a _power of two_. For the precise definition, see the "
- "[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The "
- "Hadamard Transform has been recently used a lot in various machine "
- "learning "
- "and numerical algorithms.\n\n"
- "The implementation uses "
- "[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) "
- "to speed up the computation. If AVX is not supported on your machine, "
- "a simpler implementation without (explicit) vectorization is used.\n\n"
- "The function takes two parameters:\n\n"
- "* `buffer` is a NumPy array which is being transformed. It must be a "
- "one-dimensional array with `dtype` equal to `float32` or `float64` (the "
- "former is recommended unless you need high accuracy) and of size being a "
- "power "
- "of two. If your CPU supports AVX, then `buffer` must be aligned to 32 "
- "bytes. "
- "To allocate such an aligned buffer, use the function `created_aligned` "
- "from this "
- "module.\n"
- "* `chunk` is a positive integer that controls when the implementation "
- "switches "
- "from recursive to iterative algorithm. The overall algorithm is "
- "recursive, but as "
- "soon as the vector becomes no longer than `chunk`, the iterative "
- "algorithm is "
- "invoked. For technical reasons, `chunk` must be at least 8. A good choice "
- "is to "
- "set `chunk` to 1024. But to fine-tune the performance one should use a "
- "program "
- "`best_chunk` supplied with the library.\n";
-
-static PyObject *ffht_fht(PyObject *self, PyObject *args);
-
-static PyMethodDef module_methods[] = {
- {"fht", ffht_fht, METH_VARARGS, fht_docstring}, {NULL, NULL, 0, NULL}};
-
-PyMODINIT_FUNC initffht(void);
-
-PyMODINIT_FUNC initffht(void) {
- PyObject *m = Py_InitModule3("ffht", module_methods, module_docstring);
- if (!m) return;
-
- import_array();
-}
-
-static PyObject *ffht_fht(PyObject *self, PyObject *args) {
- UNUSED(self);
-
- PyObject *buffer_obj;
-
- if (!PyArg_ParseTuple(args, "O", &buffer_obj)) {
- return NULL;
- }
-
- PyArray_Descr *dtype;
- int ndim;
- npy_intp dims[NPY_MAXDIMS];
- PyArrayObject *arr = NULL;
-
- if (PyArray_GetArrayParamsFromObject(buffer_obj, NULL, 1, &dtype, &ndim, dims,
- &arr, NULL) < 0) {
- return NULL;
- }
-
- if (arr == NULL) {
- PyErr_SetString(PyExc_TypeError, "not a numpy array");
- return NULL;
- }
-
- dtype = PyArray_DESCR(arr);
-
- if (dtype->type_num != NPY_FLOAT && dtype->type_num != NPY_DOUBLE) {
- PyErr_SetString(PyExc_TypeError, "array must consist of floats or doubles");
- Py_DECREF(arr);
- return NULL;
- }
-
- if (PyArray_NDIM(arr) != 1) {
- PyErr_SetString(PyExc_TypeError, "array must be one-dimensional");
- Py_DECREF(arr);
- return NULL;
- }
-
- int n = PyArray_DIM(arr, 0);
-
- if (n == 0 || (n & (n - 1))) {
- PyErr_SetString(PyExc_ValueError, "array's length must be a power of two");
- Py_DECREF(arr);
- return NULL;
- }
-
- int log_n = 0;
- while ((1 << log_n) < n) {
- ++log_n;
- }
-
- void *raw_buffer = PyArray_DATA(arr);
- int res;
- if (dtype->type_num == NPY_FLOAT) {
- float *buffer = (float *)raw_buffer;
- res = fht_float(buffer, log_n);
- } else {
- double *buffer = (double *)raw_buffer;
- res = fht_double(buffer, log_n);
- }
-
- if (res) {
- PyErr_SetString(PyExc_RuntimeError, "FHT did not work properly");
- Py_DECREF(arr);
- return NULL;
- }
-
- Py_DECREF(arr);
-
- return Py_BuildValue("");
-}
\ No newline at end of file
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_3.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_3.c
deleted file mode 100644
index 1afe8013e46..00000000000
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_3.c
+++ /dev/null
@@ -1,142 +0,0 @@
-#include
-#include
-#include "fht.h"
-
-#define UNUSED(x) (void)(x)
-
-static char module_docstring[] =
- "A C extension that computes the Fast Hadamard Transform";
-static char fht_docstring[] =
- "Compute the Fast Hadamard Transform (FHT) for a given "
- "one-dimensional NumPy array.\n\n"
- "The Hadamard Transform is a linear orthogonal map defined on real vectors "
- "whose length is a _power of two_. For the precise definition, see the "
- "[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The "
- "Hadamard Transform has been recently used a lot in various machine "
- "learning "
- "and numerical algorithms.\n\n"
- "The implementation uses "
- "[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) "
- "to speed up the computation. If AVX is not supported on your machine, "
- "a simpler implementation without (explicit) vectorization is used.\n\n"
- "The function takes two parameters:\n\n"
- "* `buffer` is a NumPy array which is being transformed. It must be a "
- "one-dimensional array with `dtype` equal to `float32` or `float64` (the "
- "former is recommended unless you need high accuracy) and of size being a "
- "power "
- "of two. If your CPU supports AVX, then `buffer` must be aligned to 32 "
- "bytes. "
- "To allocate such an aligned buffer, use the function `created_aligned` "
- "from this "
- "module.\n"
- "* `chunk` is a positive integer that controls when the implementation "
- "switches "
- "from recursive to iterative algorithm. The overall algorithm is "
- "recursive, but as "
- "soon as the vector becomes no longer than `chunk`, the iterative "
- "algorithm is "
- "invoked. For technical reasons, `chunk` must be at least 8. A good choice "
- "is to "
- "set `chunk` to 1024. But to fine-tune the performance one should use a "
- "program "
- "`best_chunk` supplied with the library.\n";
-
-static PyObject *ffht_fht(PyObject *self, PyObject *args) {
- UNUSED(self);
-
- PyObject *buffer_obj;
-
- if (!PyArg_ParseTuple(args, "O", &buffer_obj)) {
- return NULL;
- }
-
- PyArray_Descr *dtype;
- int ndim;
- npy_intp dims[NPY_MAXDIMS];
- PyArrayObject *arr = NULL;
-
- if (PyArray_GetArrayParamsFromObject(buffer_obj, NULL, 1, &dtype, &ndim, dims,
- &arr, NULL) < 0) {
- return NULL;
- }
-
- if (arr == NULL) {
- PyErr_SetString(PyExc_TypeError, "not a numpy array");
- return NULL;
- }
-
- dtype = PyArray_DESCR(arr);
-
- if (dtype->type_num != NPY_FLOAT && dtype->type_num != NPY_DOUBLE) {
- PyErr_SetString(PyExc_TypeError, "array must consist of floats or doubles");
- Py_DECREF(arr);
- return NULL;
- }
-
- if (PyArray_NDIM(arr) != 1) {
- PyErr_SetString(PyExc_TypeError, "array must be one-dimensional");
- Py_DECREF(arr);
- return NULL;
- }
-
- int n = PyArray_DIM(arr, 0);
-
- if (n == 0 || (n & (n - 1))) {
- PyErr_SetString(PyExc_ValueError, "array's length must be a power of two");
- Py_DECREF(arr);
- return NULL;
- }
-
- int log_n = 0;
- while ((1 << log_n) < n) {
- ++log_n;
- }
-
- void *raw_buffer = PyArray_DATA(arr);
- int res;
- if (dtype->type_num == NPY_FLOAT) {
- float *buffer = (float *)raw_buffer;
- res = fht_float(buffer, log_n);
- } else {
- double *buffer = (double *)raw_buffer;
- res = fht_double(buffer, log_n);
- }
-
- if (res) {
- PyErr_SetString(PyExc_RuntimeError, "FHT did not work properly");
- Py_DECREF(arr);
- return NULL;
- }
-
- Py_DECREF(arr);
-
- return Py_BuildValue("");
-}
-
-static PyMethodDef module_methods[] = {
- {"fht", ffht_fht, METH_VARARGS, fht_docstring},
- {NULL, NULL, 0, NULL}
-};
-
-
-static struct PyModuleDef ffhtmodule = {
- PyModuleDef_HEAD_INIT,
- "ffht",
- module_docstring,
- -1,
- module_methods
-};
-
-PyMODINIT_FUNC PyInit_ffht(void) {
- PyObject *module = PyModule_Create(&ffhtmodule);
-
- if (module == NULL) {
- printf("NULL");
- return NULL;
- }
-
- import_array();
- return module;
-}
-
-
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_header_only.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_header_only.h
deleted file mode 100644
index 76ddc2557e5..00000000000
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_header_only.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef _FHT_H_
-#define _FHT_H_
-
-#define FHT_HEADER_ONLY
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-int fht_float(float *buf, int log_n);
-int fht_double(double *buf, int log_n);
-int fht_float_oop(float *in, float *out, int log_n);
-int fht_double_oop(double *in, double *out, int log_n);
-#ifdef __cplusplus
-}
-#endif
-
-
-#ifdef __cplusplus
-static inline int fht(float *buf, int log_n) {
- return fht_float(buf, log_n);
-}
-
-static inline int fht(double *buf, int log_n) {
- return fht_double(buf, log_n);
-}
-
-static inline int fht(float *buf, float *out, int log_n) {
- return fht_float_oop(buf, out, log_n);
-}
-
-static inline int fht(double *buf, double *out, int log_n) {
- return fht_double_oop(buf, out, log_n);
-}
-#endif // #ifdef __cplusplus
-
-#include "fht_impl.h"
-
-#endif
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/setup.py b/extension/llm/custom_ops/spinquant/third-party/FFHT/setup.py
deleted file mode 100644
index f4841cb7397..00000000000
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/setup.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import sys
-
-try:
- import pypandoc
- long_description = pypandoc.convert('README.md', 'rst')
-except(IOError, ImportError):
- long_description = open('README.md').read()
-
-try:
- from setuptools import setup, find_packages, Extension
-except ImportError:
- sys.stderr.write('Setuptools not found!\n')
- raise
-
-try:
- import numpy as np
-except ImportError:
- sys.stderr.write('NumPy not found!\n')
- raise
-
-if sys.version_info[0] == 2:
- arr_sources = ['_ffht_2.c', 'fht.c']
-
-if sys.version_info[0] == 3:
- arr_sources = ['_ffht_3.c', 'fht.c']
-
-module = Extension('ffht',
- sources= arr_sources,
- extra_compile_args=['-march=native', '-O3', '-Wall', '-Wextra', '-pedantic',
- '-Wshadow', '-Wpointer-arith', '-Wcast-qual',
- '-Wstrict-prototypes', '-Wmissing-prototypes',
- '-std=c99', '-DFHT_HEADER_ONLY'],
- include_dirs=[np.get_include()])
-
-setup(name='FFHT',
- version='1.1',
- author='Ilya Razenshteyn, Ludwig Schmidt',
- author_email='falconn.lib@gmail.com',
- url='https://github.com/FALCONN-LIB/FFHT',
- description='Fast implementation of the Fast Hadamard Transform (FHT)',
- long_description=long_description,
- license='MIT',
- keywords='fast Fourier Hadamard transform butterfly',
- packages=find_packages(),
- include_package_data=True,
- ext_modules=[module])
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double_header_only.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double_header_only.c
deleted file mode 100644
index 081dca1d560..00000000000
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double_header_only.c
+++ /dev/null
@@ -1,68 +0,0 @@
-#include
-#include
-#include
-#include
-
-#include "fht_header_only.h"
-
-void dumb_fht(double *buf, int log_n);
-void dumb_fht(double *buf, int log_n) {
- int n = 1 << log_n;
- for (int i = 0; i < log_n; ++i) {
- int s1 = 1 << i;
- int s2 = s1 << 1;
- for (int j = 0; j < n; j += s2) {
- for (int k = 0; k < s1; ++k) {
- double u = buf[j + k];
- double v = buf[j + k + s1];
- buf[j + k] = u + v;
- buf[j + k + s1] = u - v;
- }
- }
- }
-}
-
-int main(void) {
- srand(4057218);
- for (int log_n = 1; log_n <= 30; ++log_n) {
- printf("%d ", log_n);
- int n = 1 << log_n;
- void *buf = malloc(sizeof(double) * n + 32);
- char *start = buf;
- while ((size_t)start % 32 != 0) start = start + 1;
- double *a = (double*)start;
- double *aux = (double*)malloc(sizeof(double) * n);
- for (int i = 0; i < n; ++i) {
- a[i] = 1.0 - 2.0 * (rand() & 1);
- aux[i] = a[i];
- }
- fht_double(a, log_n);
- dumb_fht(aux, log_n);
- double max_error = 0.0;
- for (int i = 0; i < n; ++i) {
- double error = fabs(a[i] - aux[i]);
- if (error > max_error) {
- max_error = error;
- }
- }
- if (max_error > 1e-5) {
- printf("ERROR: %.10lf\n", max_error);
- return 1;
- }
- for (int num_it = 10;; num_it *= 2) {
- clock_t tt1 = clock();
- for (int it = 0; it < num_it; ++it) {
- fht_double(a, log_n);
- }
- clock_t tt2 = clock();
- double sec = (tt2 - tt1) / (CLOCKS_PER_SEC + 0.0);
- if (sec >= 1.0) {
- printf("%.10e\n", sec / (num_it + 0.0));
- break;
- }
- }
- free(buf);
- free(aux);
- }
- return 0;
-}
diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float_header_only.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float_header_only.c
deleted file mode 100644
index d069b0c6571..00000000000
--- a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float_header_only.c
+++ /dev/null
@@ -1,68 +0,0 @@
-#include
-#include
-#include
-#include
-
-#include "fht_header_only.h"
-
-void dumb_fht(float *buf, int log_n);
-void dumb_fht(float *buf, int log_n) {
- int n = 1 << log_n;
- for (int i = 0; i < log_n; ++i) {
- int s1 = 1 << i;
- int s2 = s1 << 1;
- for (int j = 0; j < n; j += s2) {
- for (int k = 0; k < s1; ++k) {
- float u = buf[j + k];
- float v = buf[j + k + s1];
- buf[j + k] = u + v;
- buf[j + k + s1] = u - v;
- }
- }
- }
-}
-
-int main(void) {
- srand(4057218);
- for (int log_n = 1; log_n <= 30; ++log_n) {
- printf("%d ", log_n);
- int n = 1 << log_n;
- void *buf = malloc(sizeof(float) * n + 32);
- char *start = buf;
- while ((size_t)start % 32 != 0) start = start + 1;
- float *a = (float*)start;
- float *aux = (float*)malloc(sizeof(double) * n);
- for (int i = 0; i < n; ++i) {
- a[i] = 1.0 - 2.0 * (rand() & 1);
- aux[i] = a[i];
- }
- fht_float(a, log_n);
- dumb_fht(aux, log_n);
- double max_error = 0.0;
- for (int i = 0; i < n; ++i) {
- double error = fabs(a[i] - aux[i]);
- if (error > max_error) {
- max_error = error;
- }
- }
- if (max_error > 1e-5) {
- printf("ERROR: %.10lf\n", max_error);
- return 1;
- }
- for (int num_it = 10;; num_it *= 2) {
- clock_t tt1 = clock();
- for (int it = 0; it < num_it; ++it) {
- fht_float(a, log_n);
- }
- clock_t tt2 = clock();
- double sec = (tt2 - tt1) / (CLOCKS_PER_SEC + 0.0);
- if (sec >= 1.0) {
- printf("%.10e\n", sec / (num_it + 0.0));
- break;
- }
- }
- free(buf);
- free(aux);
- }
- return 0;
-}