Multithreaded build #495

Merged: 14 commits, Aug 3, 2020

Changes from 6 commits
annoy-dev-1.rockspec (4 changes: 2 additions & 2 deletions)
@@ -44,14 +44,14 @@ build = {
  unix = {
    modules = {
      ['annoy'] = {
-       libraries = {"stdc++"},
+       libraries = {"stdc++", "pthread"},
      },
    },
  },
  mingw32 = {
    modules = {
      ['annoy'] = {
-       libraries = {"stdc++"},
+       libraries = {"stdc++", "pthread"},
      },
    },
  },
examples/s_compile_cpp.sh (2 changes: 1 addition & 1 deletion)
@@ -2,6 +2,6 @@


echo "compiling precision example..."
-cmd="g++ precision_test.cpp -o precision_test -std=c++11"
+cmd="g++ precision_test.cpp -o precision_test -std=c++11 -pthread"
eval $cmd
echo "Done"
setup.py (4 changes: 2 additions & 2 deletions)
@@ -47,11 +47,11 @@
extra_compile_args += cputune

if os.name != 'nt':
-    extra_compile_args += ['-O3', '-ffast-math', '-fno-associative-math']
+    extra_compile_args += ['-std=c++11', '-O3', '-ffast-math', '-fno-associative-math']

    # #349: something with OS X Mojave causes libstd not to be found
    if platform.system() == 'Darwin':
-        extra_compile_args += ['-std=c++11', '-mmacosx-version-min=10.9']
+        extra_compile_args += ['-mmacosx-version-min=10.9']
        extra_link_args += ['-stdlib=libc++', '-mmacosx-version-min=10.9']

# Manual configuration, you're on your own here.
src/annoygomodule.h (4 changes: 2 additions & 2 deletions)
@@ -16,8 +16,8 @@ class AnnoyIndex {
  void addItem(int item, const float* w) {
    ptr->add_item(item, w);
  };
- void build(int q) {
-   ptr->build(q);
+ void build(int q, int n_threads=1) {
+   ptr->build(q, n_threads);
  };
  bool save(const char* filename, bool prefault) {
    return ptr->save(filename, prefault);
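For context, this wrapper simply forwards to the core C++ API, whose build() gains the same optional thread-count argument. A minimal sketch of calling it directly (illustrative only, patterned after examples/precision_test.cpp; assumes annoylib.h and kissrandom.h are on the include path and that you compile with -std=c++11 -pthread, per the build-flag changes above):

```cpp
#include "annoylib.h"
#include "kissrandom.h"
#include <cstdlib>

int main() {
  int f = 40;  // vector dimensionality
  AnnoyIndex<int, double, Angular, Kiss32Random> index(f);

  // Index 1000 random vectors.
  double vec[40];
  for (int i = 0; i < 1000; i++) {
    for (int z = 0; z < f; z++) vec[z] = (double)rand() / RAND_MAX;
    index.add_item(i, vec);
  }

  index.build(10, 4);  // new in this PR: 10 trees, built by 4 threads
  index.save("test.ann");
  return 0;
}
```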
src/annoylib.h (181 changes: 147 additions & 34 deletions)
@@ -57,6 +57,9 @@ typedef signed __int64 int64_t;
#include <algorithm>
#include <queue>
#include <limits>
+#include <mutex>
+#include <thread>
+#include <condition_variable>

#ifdef _MSC_VER
// Needed for Visual Studio to disable runtime checks for memcpy
@@ -104,7 +107,6 @@ inline void set_error_from_string(char **error, const char* msg) {
#ifndef _MSC_VER
#define popcount __builtin_popcountll
#else // See #293, #358
-#define isnan(x) _isnan(x)
#define popcount cole_popcount
#endif

@@ -133,6 +135,8 @@ using std::vector;
using std::pair;
using std::numeric_limits;
using std::make_pair;
+using std::mutex;
+using std::thread;

inline bool remap_memory_and_truncate(void** _ptr, int _fd, size_t old_size, size_t new_size) {
#ifdef __linux__
@@ -594,8 +598,8 @@ struct DotProduct : Angular {
    // Step one: compute the norm of each vector and store that in its extra dimension (f-1)
    for (S i = 0; i < node_count; i++) {
      Node* node = get_node_ptr<S, Node>(nodes, _s, i);
-     T norm = sqrt(dot(node->v, node->v, f));
-     if (isnan(norm)) norm = 0;
+     T d = dot(node->v, node->v, f);
+     T norm = d < 0 ? 0 : sqrt(d);
      node->dot_factor = norm;
    }

@@ -612,9 +616,8 @@ struct DotProduct : Angular {
    for (S i = 0; i < node_count; i++) {
      Node* node = get_node_ptr<S, Node>(nodes, _s, i);
      T node_norm = node->dot_factor;
-
-     T dot_factor = sqrt(pow(max_norm, static_cast<T>(2.0)) - pow(node_norm, static_cast<T>(2.0)));
-     if (isnan(dot_factor)) dot_factor = 0;
+     T squared_norm_diff = pow(max_norm, static_cast<T>(2.0)) - pow(node_norm, static_cast<T>(2.0));
+     T dot_factor = squared_norm_diff < 0 ? 0 : sqrt(squared_norm_diff);

      node->dot_factor = dot_factor;
    }
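Both rewrites above replace a compute-then-check-isnan pattern with clamping the operand before sqrt. Subtracting two nearly equal squared norms can go slightly negative through rounding, and sqrt of a negative value yields NaN; since setup.py enables -ffast-math, under which isnan() checks can be optimized away, never producing the NaN in the first place is the more robust fix (it also lets the MSVC isnan macro above be dropped). A self-contained illustration, not part of the diff:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // "Should" be zero, but rounding pushed it just below.
  double squared_diff = -1e-17;
  double raw = std::sqrt(squared_diff);                             // NaN
  double clamped = squared_diff < 0 ? 0 : std::sqrt(squared_diff);  // 0
  std::printf("raw=%f clamped=%f\n", raw, clamped);
  return 0;
}
```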
@@ -811,13 +814,32 @@ struct Manhattan : Minkowski {
  }
};

+class ThreadBarrier {
+public:
+  explicit ThreadBarrier(std::size_t count) : _count(count) { }
+
+  void wait() {
+    std::unique_lock<std::mutex> lock(_mutex);
+    if (--_count == 0) {
+      _cv.notify_all();
+    } else {
+      _cv.wait(lock, [this] { return _count == 0; });
+    }
+  }
+
+private:
+  std::mutex _mutex;
+  std::condition_variable _cv;
+  std::size_t _count;
+};
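The added ThreadBarrier is a one-shot latch: every thread that calls wait() blocks until the last of the count threads arrives, after which all of them proceed. A minimal usage sketch (not part of the diff; assumes the class above is in scope):

```cpp
#include <cstdio>
#include <thread>
#include <vector>

void worker(int id, ThreadBarrier& barrier) {
  std::printf("thread %d: local phase\n", id);
  barrier.wait();  // nobody passes until all n_threads have arrived
  std::printf("thread %d: merge phase\n", id);
}

int main() {
  const std::size_t n_threads = 4;
  ThreadBarrier barrier(n_threads);
  std::vector<std::thread> threads;
  for (std::size_t i = 0; i < n_threads; i++)
    threads.emplace_back(worker, (int)i, std::ref(barrier));
  for (auto& t : threads) t.join();
  return 0;
}
```

Note that _count never resets, so the barrier is single-use; that is sufficient here because build() constructs a fresh one per call.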

template<typename S, typename T>
class AnnoyIndexInterface {
 public:
  // Note that the methods with an **error argument will allocate memory and write the pointer to that string if error is non-NULL
  virtual ~AnnoyIndexInterface() {};
  virtual bool add_item(S item, const T* w, char** error=NULL) = 0;
- virtual bool build(int q, char** error=NULL) = 0;
+ virtual bool build(int q, int n_threads=1, char** error=NULL) = 0;
  virtual bool unbuild(char** error=NULL) = 0;
  virtual bool save(const char* filename, bool prefault=false, char** error=NULL) = 0;
  virtual void unload() = 0;
@@ -850,20 +872,21 @@ template<typename S, typename T, typename Distance, typename Random>
  const int _f;
  size_t _s;
  S _n_items;
- Random _random;
  void* _nodes; // Could either be mmapped, or point to a memory buffer that we reallocate
  S _n_nodes;
  S _nodes_size;
  vector<S> _roots;
  S _K;
+ bool _is_seeded;
+ int _seed;
  bool _loaded;
  bool _verbose;
  int _fd;
  bool _on_disk;
  bool _built;
public:

- AnnoyIndex(int f) : _f(f), _random() {
+ AnnoyIndex(int f) : _f(f) {
    _s = offsetof(Node, v) + _f * sizeof(T); // Size of each node
    _verbose = false;
    _built = false;
@@ -929,7 +952,7 @@ template<typename S, typename T, typename Distance, typename Random>
    return true;
  }

- bool build(int q, char** error=NULL) {
+ bool build(int q, int n_threads=1, char** error=NULL) {

erikbern (Collaborator): Might be good to default it to -1 and, if that's the case, set it to the number of CPUs.
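A sketch of what that suggestion could look like (hypothetical, not in this commit, which instead rejects n_threads < 1):

```cpp
#include <thread>

// Map the sentinel -1 to the detected CPU count;
// hardware_concurrency() may return 0 when it cannot tell.
int resolve_n_threads(int n_threads) {
  if (n_threads == -1) {
    unsigned int hw = std::thread::hardware_concurrency();
    return hw > 0 ? (int)hw : 1;
  }
  return n_threads;
}
```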

    if (_loaded) {
      set_error_from_string(error, "You can't build a loaded index");
      return false;
@@ -940,23 +963,32 @@ template<typename S, typename T, typename Distance, typename Random>
      return false;
    }

+   if (n_threads < 1) {
+     set_error_from_string(error, "You can't build an index with less than 1 thread");
+     return false;
+   }
+
    D::template preprocess<T, S, Node>(_nodes, _s, _n_items, _f);

    _n_nodes = _n_items;
-   while (1) {
-     if (q == -1 && _n_nodes >= _n_items * 2)
-       break;
-     if (q != -1 && _roots.size() >= (size_t)q)
-       break;
-     if (_verbose) showUpdate("pass %zd...\n", _roots.size());
-
-     vector<S> indices;
-     for (S i = 0; i < _n_items; i++) {
-       if (_get(i)->n_descendants >= 1) // Issue #223
-         indices.push_back(i);
+   std::mutex _nodes_mutex;
+   ThreadBarrier barrier(n_threads);
+   vector<std::thread> threads(n_threads);
+   int work_per_thread = (int)floor(q / (double)n_threads);

erikbern (Collaborator): What happens if q = -1? You probably have to determine q before this code, in case it's set to -1.

+   int work_remainder = q % n_threads;
+   for (int i = 0; i < n_threads; i++) {
+     int trees_per_thread = -1;
+     if (q > -1) {
+       // First thread picks up the remainder of the work

erikbern (Collaborator), Jul 28, 2020: This might lead to an uneven split: e.g. if q=20 and n_threads=7, the distribution will be [8, 2, 2, 2, 2, 2, 2].

I think you can do trees_per_thread = work_per_thread + (i < work_remainder) and it should split it like [3, 3, 3, 3, 3, 3, 2].

erikbern (Collaborator): Actually, even simpler: you can just do trees_per_thread = (q + i) / n_threads, and then you don't even have to define work_per_thread.

Author (Contributor): Awesome, I was not exactly happy with my solution, so I'll reuse this.
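To see why the suggested (q + i) / n_threads split is balanced and still sums to exactly q, a quick standalone check (illustrative):

```cpp
#include <cstdio>

int main() {
  int q = 20, n_threads = 7, total = 0;
  for (int i = 0; i < n_threads; i++) {
    int trees = (q + i) / n_threads;  // 2, 3, 3, 3, 3, 3, 3
    std::printf("thread %d builds %d trees\n", i, trees);
    total += trees;
  }
  std::printf("total = %d\n", total);  // exactly q = 20
  return 0;
}
```

Each thread gets either floor(q / n_threads) trees or one more, and the floors always add up to exactly q.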

-     _roots.push_back(_make_tree(indices, true));
-   }
+       trees_per_thread = i == 0 ? work_per_thread + work_remainder : work_per_thread;
+     }
+
+     threads[i] = std::thread(&AnnoyIndex<S, T, D, Random>::_thread_build, this, trees_per_thread, i, std::ref(barrier), std::ref(_nodes_mutex));
+   }
+
+   for (auto& thread : threads) {
+     thread.join();
+   }

    // Also, copy the roots into the last segment of the array
@@ -1035,6 +1067,7 @@ template<typename S, typename T, typename Distance, typename Random>
    _n_nodes = 0;
    _nodes_size = 0;
    _on_disk = false;
+   _is_seeded = false;
    _roots.clear();
  }

@@ -1142,7 +1175,8 @@ template<typename S, typename T, typename Distance, typename Random>
  }

  void set_seed(int seed) {
-   _random.set_seed(seed);
+   _is_seeded = true;
+   _seed = seed;
  }

protected:
@@ -1172,7 +1206,87 @@ template<typename S, typename T, typename Distance, typename Random>
    return get_node_ptr<S, Node>(_nodes, _s, i);
  }

-  S _make_tree(const vector<S >& indices, bool is_root) {
+  void _thread_build(int q, int thread_idx, ThreadBarrier& barrier, std::mutex& _nodes_mutex) {

erikbern (Collaborator): I think it would be good to have the same code for threaded and non-threaded building.

Author (Contributor): I'm not exactly sure what you mean by this: should we still support building without threads (e.g. when n_threads=1)?

erikbern (Collaborator): I'd actually be OK with only supporting threaded build, so that we can remove the non-threaded build code.

Author (Contributor): I'm probably missing something, but _thread_build is just a private helper function that was extracted from the previous build function. It is there just so I don't have to stick everything inside a lambda when spawning a new thread.

Could you point out which part of the code could potentially be removed? 😄

erikbern (Collaborator), Jul 28, 2020: I think _thread_build could be a pretty simple wrapper around _make_tree; only a couple of places touch any state. I don't think you need the magic where you go into nodes and add offsets etc. So all _thread_build would have to do is keep adding trees until you reach the limit.

erikbern (Collaborator): Yeah, exactly, that's what I was thinking.

Your approach at https://github.com/spotify/annoy/pull/495/files#diff-91eaadc9d0aa248375a22a64f315e3d8R1250 is very clever, but I'm a bit nervous it's too hacky and I'd love to have something slightly simpler :)

Doesn't your barrier roughly accomplish what a shared_mutex does? Or it should at least be possible to modify it into a sort of shared mutex? Haven't thought about it enough.

Author (Contributor): I agree, I would definitely feel more comfortable if I could simplify it a bit 😄

Unfortunately, a barrier does not accomplish what a shared_mutex is supposed to do. It can only be used as a global synchronization point, not for reader/writer-style locking.

After reading a couple of SO threads, a custom shared mutex could prove really tricky to implement correctly. We could use shared_timed_mutex, which has a few additional capabilities and is available from C++14. Either way, I'll implement the less magic version we discussed, and then we can compare the performance and decide whether it's worth bumping the C++ version.

Author (Contributor): Well, I managed to simplify my solution in the latest commit 😄 I removed all of the magic and introduced a couple more mutex variables and a shared_timed_mutex variable.

I've re-run my time measurements on glove-angular-25:

Num. threads | n_trees=32 time (s) | n_trees=64 time (s)
------------ | ------------------- | -------------------
1            | 49.29               | 99.89
2            | 26.54               | 53.29
4            | 15.03               | 29.34
8            | 10.42               | 18.97
16           | 9.76                | 17.07

erikbern (Collaborator): Wow, amazing! Any idea if there's any slowdown compared to the current master?

The new code looks much cleaner and less "clever". I don't know how well supported C++14 is, but hopefully it's not a big problem.

Do you really need so many different mutexes? Not a concern, I was just surprised to see four different ones.

Author (Contributor): I can prepare a more comprehensive timing benchmark and include a comparison with the current master. But as far as I can tell, there isn't a significant slowdown.

Regarding the number of mutexes: if I guard everything with the shared nodes_mutex, a single thread acquiring an exclusive lock makes everything grind to a halt. So I opted for a more granular approach, where I exclusively lock the nodes_mutex only when absolutely necessary, and add separate regular locks for all other shared variables (nodes_size, _n_nodes). I'll see if I can cut down on the number of mutexes, but I currently can't see a better way.
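A rough sketch of the granular locking being described (names are hypothetical; the later commits may differ): threads hold a shared lock while filling already-allocated, disjoint slots of _nodes, and take the exclusive lock only around reallocation, with plain mutexes guarding the small shared counters.

```cpp
#include <mutex>
#include <shared_mutex>  // std::shared_timed_mutex, std::shared_lock (C++14)

std::shared_timed_mutex nodes_mutex;  // guards the _nodes buffer itself
std::mutex n_nodes_mutex;             // guards the _n_nodes counter

void write_node_slot() {
  // Many threads may do this concurrently: each writes a slot no other thread owns.
  std::shared_lock<std::shared_timed_mutex> lock(nodes_mutex);
  // ... memcpy node data into the reserved slot ...
}

void grow_nodes_buffer() {
  // Reallocation moves the buffer, so every shared holder must be excluded.
  std::unique_lock<std::shared_timed_mutex> lock(nodes_mutex);
  // ... realloc/mremap _nodes ...
}
```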

+    Random _random;
+    // Each thread needs its own seed, otherwise each thread would be building the same tree(s)
+    int seed = _is_seeded ? _seed + thread_idx : thread_idx;
+    _random.set_seed(seed);
+
+    vector<vector<Node*> > thread_trees;
+    vector<S> thread_roots;
+    while (1) {
+      if (q == -1) {

erikbern (Collaborator): This code to determine q if q = -1 could probably just be left in the build function.

Author (Contributor): The thing with q = -1 is that I don't know beforehand how many trees each thread should build, because it depends on the number of nodes. That is why I have to check for -1 everywhere.

A better solution would be to introduce a different meaning for q = -1, where each thread would build just one tree, for example.

erikbern (Collaborator): Ah yeah, good point. I forgot about that. OK, if your code works then let's keep it, I guess.

+        size_t thread_n_nodes = 0;
+        for (size_t tree_idx = 0; tree_idx < thread_trees.size(); tree_idx++) {
+          thread_n_nodes += thread_trees[tree_idx].size();
+        }
+        if (thread_n_nodes >= 2 * (size_t)_n_items) {
+          break;
+        }
+      } else {
+        if (thread_roots.size() >= (size_t)q) {
+          break;
+        }
+      }
+
+      if (_verbose) showUpdate("pass %zd...\n", thread_roots.size());
+
+      vector<S> indices;
+      for (S i = 0; i < _n_items; i++) {
+        if (_get(i)->n_descendants >= 1) // Issue #223
+          indices.push_back(i);
+      }
+
+      vector<Node*> split_nodes;
+      // Each thread is essentially pretending to build only one tree that will get inserted
+      // right after the already inserted items. Indices of split nodes start with _n_items, _n_items + 1, ...
+      // We do not want to mutate the _nodes array during tree construction due to reallocation issues. That is
+      // why each thread stores the trees locally until all threads are ready to insert them into _nodes.
+      S root_node = _make_tree(indices, split_nodes, true, _random);
+      thread_roots.push_back(root_node);
+      thread_trees.push_back(split_nodes);
+    }
+
+    // Wait for all threads to finish before we can start inserting tree nodes into the global _nodes array
+    barrier.wait();

erikbern (Collaborator): Isn't locking the mutex enough to enforce this?

Author (Contributor): Unfortunately not, for the reasons noted in #495 (comment): we have to wait for all threads to finish their work before we can mutate the global _nodes buffer.


+    _nodes_mutex.lock();
+    // When a thread wants to insert its local tree nodes into the global _nodes, it has to stop pretending
+    // that there is going to be only one tree. Each thread has to update all split-node children that point
+    // to other split nodes, because their indices will change once inserted into the global _nodes.
+    for (size_t tree_idx = 0; tree_idx < thread_trees.size(); tree_idx++) {
+      vector<Node*> split_nodes = thread_trees[tree_idx];
+      // Offset from _n_items where split nodes will get inserted
+      S split_nodes_offset = _n_nodes - _n_items;
+      _allocate_size(_n_nodes + split_nodes.size());
+
+      for (size_t node_idx = 0; node_idx < split_nodes.size(); node_idx++) {
+        Node* split_node = split_nodes[node_idx];
+        bool is_root = (size_t)thread_roots[tree_idx] == (_n_items + node_idx);
+
+        // Inverted condition from _make_tree to detect split nodes
+        if ((split_node->n_descendants > _K) || (is_root && (size_t)_n_items > (size_t)_K && split_node->n_descendants > 1)) {
+          for (size_t child_idx = 0; child_idx < 2; child_idx++) {
+            // Update the child index if it points to a split node
+            if (split_node->children[child_idx] >= _n_items) {
+              split_node->children[child_idx] += split_nodes_offset;
+            }
+          }
+        }
+
+        memcpy(_get(_n_nodes), split_node, _s);
+        free(split_node);
+
+        _n_nodes += 1;
+      }
+
+      thread_roots[tree_idx] += split_nodes_offset;
+    }
+    _roots.insert(_roots.end(), thread_roots.begin(), thread_roots.end());
+    _nodes_mutex.unlock();
+  }
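To make the remapping concrete, a worked example with hypothetical numbers (not from the PR):

```cpp
// Suppose _n_items == 1000 and this thread built one tree with three split
// nodes, which it numbered locally as 1000, 1001, 1002 (i.e. starting at
// _n_items). If _n_nodes == 1500 when the thread acquires the lock:
//
//   split_nodes_offset = _n_nodes - _n_items = 1500 - 1000 = 500
//
// so the nodes are copied into global slots 1500, 1501, 1502. A child index
// of 1001 (>= _n_items, hence a split node) becomes 1001 + 500 = 1501, while
// child indices below _n_items refer to items and are left untouched. The
// tree's root, locally 1000, likewise becomes 1500 and is what gets pushed
// into _roots.
```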

+  S _make_tree(const vector<S >& indices, vector<Node* >& split_nodes, bool is_root, Random& _random) {
    // The basic rule is that if we have <= _K items, then it's a leaf node, otherwise it's a split node.
    // There are some regrettable complications caused by the problem that root nodes have to be "special":
    // 1. We identify root nodes by the arguable logic that _n_items == n->n_descendants, regardless of how many descendants they actually have
@@ -1182,9 +1296,8 @@ template<typename S, typename T, typename Distance, typename Random>
      return indices[0];

    if (indices.size() <= (size_t)_K && (!is_root || (size_t)_n_items <= (size_t)_K || indices.size() == 1)) {
-     _allocate_size(_n_nodes + 1);
-     S item = _n_nodes++;
-     Node* m = _get(item);
+     Node* m = (Node*)malloc(_s);
+     memset(m, 0, _s);
      m->n_descendants = is_root ? _n_items : (S)indices.size();

      // Using std::copy instead of a loop seems to resolve issues #3 and #13,
@@ -1193,7 +1306,9 @@ template<typename S, typename T, typename Distance, typename Random>
      // Only copy when necessary to avoid crash in MSVC 9. #293
      if (!indices.empty())
        memcpy(m->children, &indices[0], indices.size() * sizeof(S));
-     return item;
+
+     split_nodes.push_back(m);
+     return _n_items + (split_nodes.size() - 1);
    }

    vector<Node*> children;
@@ -1205,7 +1320,8 @@ template<typename S, typename T, typename Distance, typename Random>
    }

    vector<S> children_indices[2];
-   Node* m = (Node*)alloca(_s);
+   Node* m = (Node*)malloc(_s);
+   memset(m, 0, _s);
    D::create_split(children, _f, _s, _random, m);

    for (size_t i = 0; i < indices.size(); i++) {
@@ -1246,14 +1362,11 @@ template<typename S, typename T, typename Distance, typename Random>
    m->n_descendants = is_root ? _n_items : (S)indices.size();
    for (int side = 0; side < 2; side++) {
      // run _make_tree for the smallest child first (for cache locality)
-     m->children[side^flip] = _make_tree(children_indices[side^flip], false);
+     m->children[side^flip] = _make_tree(children_indices[side^flip], split_nodes, false, _random);
    }

-   _allocate_size(_n_nodes + 1);
-   S item = _n_nodes++;
-   memcpy(_get(item), m, _s);
-
-   return item;
+   split_nodes.push_back(m);
+   return _n_items + (split_nodes.size() - 1);
  }

  void _get_all_nns(const T* v, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {
src/annoyluamodule.cc (7 changes: 6 additions & 1 deletion)
@@ -118,9 +118,14 @@ class LuaAnnoy {
  }

  static int build(lua_State* L) {
+   int nargs = lua_gettop(L);
    Impl* self = getAnnoy(L, 1);
    int n_trees = luaL_checkinteger(L, 2);
-   self->build(n_trees);
+   int n_threads = 1;
+   if (nargs >= 3) {
+     n_threads = luaL_checkinteger(L, 3);
+   }
+   self->build(n_trees, n_threads);
    lua_pushboolean(L, true);
    return 1;
  }