From e2fb5b929a40da0b1a808eea7e73ee05ff7aca3d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 11 Sep 2024 16:17:55 -0700 Subject: [PATCH] [ExecuTorch] Remove unneeded FFHT files We are just using FFHT to generate a kernel, so we don't need the Python wrapper. Differential Revision: [D60194971](https://our.internmc.facebook.com/intern/diff/D60194971/) [ghstack-poisoned] --- .../spinquant/third-party/FFHT/README.md | 116 +------------- .../spinquant/third-party/FFHT/_ffht_2.c | 128 ---------------- .../spinquant/third-party/FFHT/_ffht_3.c | 142 ------------------ .../third-party/FFHT/fht_header_only.h | 38 ----- .../spinquant/third-party/FFHT/setup.py | 46 ------ .../FFHT/test_double_header_only.c | 68 --------- .../third-party/FFHT/test_float_header_only.c | 68 --------- 7 files changed, 3 insertions(+), 603 deletions(-) delete mode 100644 extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_2.c delete mode 100644 extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_3.c delete mode 100644 extension/llm/custom_ops/spinquant/third-party/FFHT/fht_header_only.h delete mode 100644 extension/llm/custom_ops/spinquant/third-party/FFHT/setup.py delete mode 100644 extension/llm/custom_ops/spinquant/third-party/FFHT/test_double_header_only.c delete mode 100644 extension/llm/custom_ops/spinquant/third-party/FFHT/test_float_header_only.c diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md b/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md index 7e00d0eca9a..dcc9840f25a 100644 --- a/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/README.md @@ -1,115 +1,5 @@ # Fast Fast Hadamard Transform -FFHT (Fast Fast Hadamard Transform) is a library that provides a heavily -optimized C99 implementation of the Fast Hadamard Transform. FFHT also provides -a thin Python wrapper that allows to perform the Fast Hadamard Transform on -one-dimensional [NumPy](http://www.numpy.org/) arrays. - -The Hadamard Transform is a linear orthogonal map defined on real vectors whose -length is a _power of two_. For the precise definition, see the -[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The -Hadamard Transform has been recently used a lot in various machine learning -and numerical algorithms. - -FFHT uses [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) -to speed up the computation. - -The header file `fht.h` exports two functions: `int fht_float(float *buf, int -log_n)` and `int fht_double(double *buf, int log_n)`. The -only difference between them is the type of vector entries. So, in what follows, -we describe how the version for floats `fht_float` works. - -The function `fht_float` takes two parameters: - -* `buf` is a pointer to the data on which one needs to perform the Fast -Hadamard Transform. -* `log_n` is the binary logarithm of the length of `buffer`. -That is, the length is equal to `2^log_n`. - -The return value is -1 if the input is invalid and is zero otherwise. - -A header-only version of the library is provided in `fht_header_only.h`. - -In addition to the Fast Hadamard Transform, we provide two auxiliary programs: -`test_float` and `test_double`, which are implemented in C99. The exhaustively -test and benchmark the library. - -FFHT has been tested on 64-bit versions of Linux, OS X and Windows (the latter -is via Cygwin). - -To install the Python package, run `python setup.py install`. The script -`example.py` shows how to use FFHT from Python. - -## Benchmarks - -Below are the times for the Fast Hadamard Transform for vectors of -various lengths. The benchmarks were run on a machine with Intel -Core i7-6700K and 2133 MHz DDR4 RAM. We compare FFHT, -[FFTW 3.3.6](http://fftw.org/), and -[fht](https://github.com/nbarbey/fht) by -[Nicolas Barbey](https://github.com/nbarbey). - -Let us stress that FFTW is a great versatile tool, and the authors of FFTW did -not try to optimize the performace of the Fast Hadamard Transform. On the other -hand, FFHT does one thing (the Fast Hadamard Transform), but does it extremely -well. - -Vector size | FFHT (float) | FFHT (double) | FFTW 3.3.6 (float) | FFTW 3.3.6 (double) | fht (float) | fht (double) -:---: | :---: | :---: | :---: | :---: | :---: | :---: -210 | 0.31 us | 0.49 us | 4.48 us | 7.72 us | 17.4 us | 19.3 us -220 | 0.68 ms | 1.39 ms | 8.81 ms | 17.07 ms | 29.8 ms | 35.0 ms -227 | 0.22 s | 0.50 s | 2.08 s | 3.57 s | 6.89 s | 7.49 s - -## Troubleshooting - -For some versions of OS X the native `clang` compiler (that mimicks `gcc`) may -not recognize the availability of AVX. A solution for this problem is to use a -genuine `gcc` (say from [Homebrew](http://brew.sh/)) or to use `-march=corei7-avx` -instead of `-march=native` for compiler flags. - -A symptom of the above happening is the undefined macros `__AVX__`. - -## Related Work - -FFHT has been created as a part of -[FALCONN](https://github.com/falconn-lib/falconn): a library for similarity -search over high-dimensional data. FALCONN's underlying algorithms are described -and analyzed in the following research paper: - -> Alexandr Andoni, Piotr Indyk, Thijs Laarhoven, Ilya Razenshteyn and Ludwig -> Schmidt, "Practical and Optimal LSH for Angular Distance", NIPS 2015, full -> version available at [arXiv:1509.02897](http://arxiv.org/abs/1509.02897) - -This is the right paper to cite, if you use FFHT for your research projects. - -## Acknowledgments - -We thank Ruslan Savchenko for useful discussions. - -Thanks to: - -* Clement Canonne -* Michal Forisek -* Rati Gelashvili -* Daniel Grier -* Dhiraj Holden -* Justin Holmgren -* Aleksandar Ivanovic -* Vladislav Isenbaev -* Jacob Kogler -* Ilya Kornakov -* Anton Lapshin -* Rio LaVigne -* Oleg Martynov -* Linar Mikeev -* Cameron Musco -* Sam Park -* Sunoo Park -* Amelia Perry -* Andrew Sabisch -* Abhishek Sarkar -* Ruslan Savchenko -* Vadim Semenov -* Arman Yessenamanov - -for helping us with testing FFHT. +This directory contains a fork of https://github.com/FALCONN-LIB/FFHT +(License: https://github.com/FALCONN-LIB/FFHT/blob/master/LICENSE.md) +focused on ARM64 NEON code generation. diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_2.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_2.c deleted file mode 100644 index 2041e5eedec..00000000000 --- a/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_2.c +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include -#include "fht.h" - -#define UNUSED(x) (void)(x) - -static char module_docstring[] = - "A C extension that computes the Fast Hadamard Transform"; -static char fht_docstring[] = - "Compute the Fast Hadamard Transform (FHT) for a given " - "one-dimensional NumPy array.\n\n" - "The Hadamard Transform is a linear orthogonal map defined on real vectors " - "whose length is a _power of two_. For the precise definition, see the " - "[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The " - "Hadamard Transform has been recently used a lot in various machine " - "learning " - "and numerical algorithms.\n\n" - "The implementation uses " - "[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) " - "to speed up the computation. If AVX is not supported on your machine, " - "a simpler implementation without (explicit) vectorization is used.\n\n" - "The function takes two parameters:\n\n" - "* `buffer` is a NumPy array which is being transformed. It must be a " - "one-dimensional array with `dtype` equal to `float32` or `float64` (the " - "former is recommended unless you need high accuracy) and of size being a " - "power " - "of two. If your CPU supports AVX, then `buffer` must be aligned to 32 " - "bytes. " - "To allocate such an aligned buffer, use the function `created_aligned` " - "from this " - "module.\n" - "* `chunk` is a positive integer that controls when the implementation " - "switches " - "from recursive to iterative algorithm. The overall algorithm is " - "recursive, but as " - "soon as the vector becomes no longer than `chunk`, the iterative " - "algorithm is " - "invoked. For technical reasons, `chunk` must be at least 8. A good choice " - "is to " - "set `chunk` to 1024. But to fine-tune the performance one should use a " - "program " - "`best_chunk` supplied with the library.\n"; - -static PyObject *ffht_fht(PyObject *self, PyObject *args); - -static PyMethodDef module_methods[] = { - {"fht", ffht_fht, METH_VARARGS, fht_docstring}, {NULL, NULL, 0, NULL}}; - -PyMODINIT_FUNC initffht(void); - -PyMODINIT_FUNC initffht(void) { - PyObject *m = Py_InitModule3("ffht", module_methods, module_docstring); - if (!m) return; - - import_array(); -} - -static PyObject *ffht_fht(PyObject *self, PyObject *args) { - UNUSED(self); - - PyObject *buffer_obj; - - if (!PyArg_ParseTuple(args, "O", &buffer_obj)) { - return NULL; - } - - PyArray_Descr *dtype; - int ndim; - npy_intp dims[NPY_MAXDIMS]; - PyArrayObject *arr = NULL; - - if (PyArray_GetArrayParamsFromObject(buffer_obj, NULL, 1, &dtype, &ndim, dims, - &arr, NULL) < 0) { - return NULL; - } - - if (arr == NULL) { - PyErr_SetString(PyExc_TypeError, "not a numpy array"); - return NULL; - } - - dtype = PyArray_DESCR(arr); - - if (dtype->type_num != NPY_FLOAT && dtype->type_num != NPY_DOUBLE) { - PyErr_SetString(PyExc_TypeError, "array must consist of floats or doubles"); - Py_DECREF(arr); - return NULL; - } - - if (PyArray_NDIM(arr) != 1) { - PyErr_SetString(PyExc_TypeError, "array must be one-dimensional"); - Py_DECREF(arr); - return NULL; - } - - int n = PyArray_DIM(arr, 0); - - if (n == 0 || (n & (n - 1))) { - PyErr_SetString(PyExc_ValueError, "array's length must be a power of two"); - Py_DECREF(arr); - return NULL; - } - - int log_n = 0; - while ((1 << log_n) < n) { - ++log_n; - } - - void *raw_buffer = PyArray_DATA(arr); - int res; - if (dtype->type_num == NPY_FLOAT) { - float *buffer = (float *)raw_buffer; - res = fht_float(buffer, log_n); - } else { - double *buffer = (double *)raw_buffer; - res = fht_double(buffer, log_n); - } - - if (res) { - PyErr_SetString(PyExc_RuntimeError, "FHT did not work properly"); - Py_DECREF(arr); - return NULL; - } - - Py_DECREF(arr); - - return Py_BuildValue(""); -} \ No newline at end of file diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_3.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_3.c deleted file mode 100644 index 1afe8013e46..00000000000 --- a/extension/llm/custom_ops/spinquant/third-party/FFHT/_ffht_3.c +++ /dev/null @@ -1,142 +0,0 @@ -#include -#include -#include "fht.h" - -#define UNUSED(x) (void)(x) - -static char module_docstring[] = - "A C extension that computes the Fast Hadamard Transform"; -static char fht_docstring[] = - "Compute the Fast Hadamard Transform (FHT) for a given " - "one-dimensional NumPy array.\n\n" - "The Hadamard Transform is a linear orthogonal map defined on real vectors " - "whose length is a _power of two_. For the precise definition, see the " - "[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The " - "Hadamard Transform has been recently used a lot in various machine " - "learning " - "and numerical algorithms.\n\n" - "The implementation uses " - "[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) " - "to speed up the computation. If AVX is not supported on your machine, " - "a simpler implementation without (explicit) vectorization is used.\n\n" - "The function takes two parameters:\n\n" - "* `buffer` is a NumPy array which is being transformed. It must be a " - "one-dimensional array with `dtype` equal to `float32` or `float64` (the " - "former is recommended unless you need high accuracy) and of size being a " - "power " - "of two. If your CPU supports AVX, then `buffer` must be aligned to 32 " - "bytes. " - "To allocate such an aligned buffer, use the function `created_aligned` " - "from this " - "module.\n" - "* `chunk` is a positive integer that controls when the implementation " - "switches " - "from recursive to iterative algorithm. The overall algorithm is " - "recursive, but as " - "soon as the vector becomes no longer than `chunk`, the iterative " - "algorithm is " - "invoked. For technical reasons, `chunk` must be at least 8. A good choice " - "is to " - "set `chunk` to 1024. But to fine-tune the performance one should use a " - "program " - "`best_chunk` supplied with the library.\n"; - -static PyObject *ffht_fht(PyObject *self, PyObject *args) { - UNUSED(self); - - PyObject *buffer_obj; - - if (!PyArg_ParseTuple(args, "O", &buffer_obj)) { - return NULL; - } - - PyArray_Descr *dtype; - int ndim; - npy_intp dims[NPY_MAXDIMS]; - PyArrayObject *arr = NULL; - - if (PyArray_GetArrayParamsFromObject(buffer_obj, NULL, 1, &dtype, &ndim, dims, - &arr, NULL) < 0) { - return NULL; - } - - if (arr == NULL) { - PyErr_SetString(PyExc_TypeError, "not a numpy array"); - return NULL; - } - - dtype = PyArray_DESCR(arr); - - if (dtype->type_num != NPY_FLOAT && dtype->type_num != NPY_DOUBLE) { - PyErr_SetString(PyExc_TypeError, "array must consist of floats or doubles"); - Py_DECREF(arr); - return NULL; - } - - if (PyArray_NDIM(arr) != 1) { - PyErr_SetString(PyExc_TypeError, "array must be one-dimensional"); - Py_DECREF(arr); - return NULL; - } - - int n = PyArray_DIM(arr, 0); - - if (n == 0 || (n & (n - 1))) { - PyErr_SetString(PyExc_ValueError, "array's length must be a power of two"); - Py_DECREF(arr); - return NULL; - } - - int log_n = 0; - while ((1 << log_n) < n) { - ++log_n; - } - - void *raw_buffer = PyArray_DATA(arr); - int res; - if (dtype->type_num == NPY_FLOAT) { - float *buffer = (float *)raw_buffer; - res = fht_float(buffer, log_n); - } else { - double *buffer = (double *)raw_buffer; - res = fht_double(buffer, log_n); - } - - if (res) { - PyErr_SetString(PyExc_RuntimeError, "FHT did not work properly"); - Py_DECREF(arr); - return NULL; - } - - Py_DECREF(arr); - - return Py_BuildValue(""); -} - -static PyMethodDef module_methods[] = { - {"fht", ffht_fht, METH_VARARGS, fht_docstring}, - {NULL, NULL, 0, NULL} -}; - - -static struct PyModuleDef ffhtmodule = { - PyModuleDef_HEAD_INIT, - "ffht", - module_docstring, - -1, - module_methods -}; - -PyMODINIT_FUNC PyInit_ffht(void) { - PyObject *module = PyModule_Create(&ffhtmodule); - - if (module == NULL) { - printf("NULL"); - return NULL; - } - - import_array(); - return module; -} - - diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_header_only.h b/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_header_only.h deleted file mode 100644 index 76ddc2557e5..00000000000 --- a/extension/llm/custom_ops/spinquant/third-party/FFHT/fht_header_only.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _FHT_H_ -#define _FHT_H_ - -#define FHT_HEADER_ONLY - -#ifdef __cplusplus -extern "C" { -#endif -int fht_float(float *buf, int log_n); -int fht_double(double *buf, int log_n); -int fht_float_oop(float *in, float *out, int log_n); -int fht_double_oop(double *in, double *out, int log_n); -#ifdef __cplusplus -} -#endif - - -#ifdef __cplusplus -static inline int fht(float *buf, int log_n) { - return fht_float(buf, log_n); -} - -static inline int fht(double *buf, int log_n) { - return fht_double(buf, log_n); -} - -static inline int fht(float *buf, float *out, int log_n) { - return fht_float_oop(buf, out, log_n); -} - -static inline int fht(double *buf, double *out, int log_n) { - return fht_double_oop(buf, out, log_n); -} -#endif // #ifdef __cplusplus - -#include "fht_impl.h" - -#endif diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/setup.py b/extension/llm/custom_ops/spinquant/third-party/FFHT/setup.py deleted file mode 100644 index f4841cb7397..00000000000 --- a/extension/llm/custom_ops/spinquant/third-party/FFHT/setup.py +++ /dev/null @@ -1,46 +0,0 @@ -import sys - -try: - import pypandoc - long_description = pypandoc.convert('README.md', 'rst') -except(IOError, ImportError): - long_description = open('README.md').read() - -try: - from setuptools import setup, find_packages, Extension -except ImportError: - sys.stderr.write('Setuptools not found!\n') - raise - -try: - import numpy as np -except ImportError: - sys.stderr.write('NumPy not found!\n') - raise - -if sys.version_info[0] == 2: - arr_sources = ['_ffht_2.c', 'fht.c'] - -if sys.version_info[0] == 3: - arr_sources = ['_ffht_3.c', 'fht.c'] - -module = Extension('ffht', - sources= arr_sources, - extra_compile_args=['-march=native', '-O3', '-Wall', '-Wextra', '-pedantic', - '-Wshadow', '-Wpointer-arith', '-Wcast-qual', - '-Wstrict-prototypes', '-Wmissing-prototypes', - '-std=c99', '-DFHT_HEADER_ONLY'], - include_dirs=[np.get_include()]) - -setup(name='FFHT', - version='1.1', - author='Ilya Razenshteyn, Ludwig Schmidt', - author_email='falconn.lib@gmail.com', - url='https://github.com/FALCONN-LIB/FFHT', - description='Fast implementation of the Fast Hadamard Transform (FHT)', - long_description=long_description, - license='MIT', - keywords='fast Fourier Hadamard transform butterfly', - packages=find_packages(), - include_package_data=True, - ext_modules=[module]) diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double_header_only.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double_header_only.c deleted file mode 100644 index 081dca1d560..00000000000 --- a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_double_header_only.c +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include -#include -#include - -#include "fht_header_only.h" - -void dumb_fht(double *buf, int log_n); -void dumb_fht(double *buf, int log_n) { - int n = 1 << log_n; - for (int i = 0; i < log_n; ++i) { - int s1 = 1 << i; - int s2 = s1 << 1; - for (int j = 0; j < n; j += s2) { - for (int k = 0; k < s1; ++k) { - double u = buf[j + k]; - double v = buf[j + k + s1]; - buf[j + k] = u + v; - buf[j + k + s1] = u - v; - } - } - } -} - -int main(void) { - srand(4057218); - for (int log_n = 1; log_n <= 30; ++log_n) { - printf("%d ", log_n); - int n = 1 << log_n; - void *buf = malloc(sizeof(double) * n + 32); - char *start = buf; - while ((size_t)start % 32 != 0) start = start + 1; - double *a = (double*)start; - double *aux = (double*)malloc(sizeof(double) * n); - for (int i = 0; i < n; ++i) { - a[i] = 1.0 - 2.0 * (rand() & 1); - aux[i] = a[i]; - } - fht_double(a, log_n); - dumb_fht(aux, log_n); - double max_error = 0.0; - for (int i = 0; i < n; ++i) { - double error = fabs(a[i] - aux[i]); - if (error > max_error) { - max_error = error; - } - } - if (max_error > 1e-5) { - printf("ERROR: %.10lf\n", max_error); - return 1; - } - for (int num_it = 10;; num_it *= 2) { - clock_t tt1 = clock(); - for (int it = 0; it < num_it; ++it) { - fht_double(a, log_n); - } - clock_t tt2 = clock(); - double sec = (tt2 - tt1) / (CLOCKS_PER_SEC + 0.0); - if (sec >= 1.0) { - printf("%.10e\n", sec / (num_it + 0.0)); - break; - } - } - free(buf); - free(aux); - } - return 0; -} diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float_header_only.c b/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float_header_only.c deleted file mode 100644 index d069b0c6571..00000000000 --- a/extension/llm/custom_ops/spinquant/third-party/FFHT/test_float_header_only.c +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include -#include -#include - -#include "fht_header_only.h" - -void dumb_fht(float *buf, int log_n); -void dumb_fht(float *buf, int log_n) { - int n = 1 << log_n; - for (int i = 0; i < log_n; ++i) { - int s1 = 1 << i; - int s2 = s1 << 1; - for (int j = 0; j < n; j += s2) { - for (int k = 0; k < s1; ++k) { - float u = buf[j + k]; - float v = buf[j + k + s1]; - buf[j + k] = u + v; - buf[j + k + s1] = u - v; - } - } - } -} - -int main(void) { - srand(4057218); - for (int log_n = 1; log_n <= 30; ++log_n) { - printf("%d ", log_n); - int n = 1 << log_n; - void *buf = malloc(sizeof(float) * n + 32); - char *start = buf; - while ((size_t)start % 32 != 0) start = start + 1; - float *a = (float*)start; - float *aux = (float*)malloc(sizeof(double) * n); - for (int i = 0; i < n; ++i) { - a[i] = 1.0 - 2.0 * (rand() & 1); - aux[i] = a[i]; - } - fht_float(a, log_n); - dumb_fht(aux, log_n); - double max_error = 0.0; - for (int i = 0; i < n; ++i) { - double error = fabs(a[i] - aux[i]); - if (error > max_error) { - max_error = error; - } - } - if (max_error > 1e-5) { - printf("ERROR: %.10lf\n", max_error); - return 1; - } - for (int num_it = 10;; num_it *= 2) { - clock_t tt1 = clock(); - for (int it = 0; it < num_it; ++it) { - fht_float(a, log_n); - } - clock_t tt2 = clock(); - double sec = (tt2 - tt1) / (CLOCKS_PER_SEC + 0.0); - if (sec >= 1.0) { - printf("%.10e\n", sec / (num_it + 0.0)); - break; - } - } - free(buf); - free(aux); - } - return 0; -}