In [1]:
import annoy

# Inventory of the installed build: the package version, the names exported by
# the package, and the attributes exposed by the Annoy type.
(
    annoy.__version__,
    dir(annoy),
    dir(annoy.Annoy),
)

('2.0.0',
 ['Annoy',
  'AnnoyIndex',
  '__builtins__',
  '__cached__',
  '__doc__',
  '__file__',
  '__loader__',
  '__name__',
  '__package__',
  '__path__',
  '__spec__',
  '__version__',
  'annoylib'],
 ['__class__',
  '__delattr__',
  '__dir__',
  '__doc__',
  '__eq__',
  '__format__',
  '__ge__',
  '__getattribute__',
  '__getstate__',
  '__gt__',
  '__hash__',
  '__init__',
  '__init_subclass__',
  '__le__',
  '__len__',
  '__lt__',
  '__ne__',
  '__new__',
  '__reduce__',
  '__reduce_ex__',
  '__repr__',
  '__setattr__',
  '__sizeof__',
  '__str__',
  '__subclasshook__',
  'add_item',
  'build',
  'deserialize',
  'f',
  'get_distance',
  'get_item_vector',
  'get_n_items',
  'get_n_trees',
  'get_nns_by_item',
  'get_nns_by_vector',
  'info',
  'load',
  'memory_usage',
  'metric',
  'on_disk_build',
  'save',
  'serialize',
  'set_seed',
  'unbuild',
  'unload',
  'verbose'])

In [2]:
import random; random.seed(0)
import pytest
from annoy import Annoy, AnnoyIndex

Annoy?

Init signature: Annoy(self, /, *args, **kwargs)
Docstring:
Compiled with GCC/Clang. Not using AVX instructions.

High-performance approximate nearest neighbours (Annoy) C++ core.

This module is a low-level backend (``annoylib``). It exposes the
C++-powered :class:`Annoy` type. For day-to-day work, prefer the
high-level Python API in the :mod:`annoy` package:

    from annoy import Annoy, AnnoyIndex
File:           /work/.git_clones/annoy/annoy/__init__.py
Type:           type
Subclasses:

In [3]:
# =============================================================
# 1. Construction (dimension 0, no metric given)
# =============================================================
idx = AnnoyIndex(0)

# Report the two attributes first, then the repr and the info() summary.
for label, value in (("Index dimension:", idx.f), ("Metric         :", idx.metric)):
    print(label, value)
for summary in (idx, idx.info()):
    print(summary)

Index dimension: 0
Metric         : None
Annoy(f=0, metric='unknown', n_items=0, n_trees=0, on_disk_path=None)
{'dimension': 0, 'metric': '', 'n_items': 0, 'n_trees': 0, 'memory_usage_bytes': 0, 'memory_usage_mib': 0.0, 'on_disk_path': None}


In [4]:
    import numpy as np

    # Dot-product index over 100k random 3-d vectors.
    # A seeded Generator keeps the input data repeatable across kernel
    # restarts (the unseeded global NumPy state did not), and a single
    # (n_vectors, f) draw replaces 100k separate np.random.normal calls.
    f = 3
    n_vectors = 100000
    rng = np.random.default_rng(0)
    vectors = rng.normal(size=(n_vectors, f))

    i = AnnoyIndex(f, "dot")
    for j, vector in enumerate(vectors):
        i.add_item(j, vector)
    i.build(10)

    # Ask for every item so the full range of distances can be inspected.
    indices, dists = i.get_nns_by_item(0, n_vectors, include_distances=True)
    # NOTE(review): these bounds look like they were written for the angular
    # metric; with "dot" the recorded nearest distance is already above 2.0,
    # so they stay disabled.
    # assert max(dists) <= 2.0
    # assert min(dists) == pytest.approx(0.0, rel=1e-11, abs=1e-11)

In [5]:
# Single nearest neighbour of item 0 in the "dot" index, with its distance.
i.get_nns_by_item(
    0,
    1,
    include_distances=True,
)

([24008], [10.789156913757324])

In [6]:
# =============================================================
# 1. Construction (keyword form, metric not specified)
# =============================================================
idx = AnnoyIndex(f=3)

# The recorded output reports 'angular' as the metric for this call.
dimension, metric_name = idx.f, idx.metric
print("Index dimension:", dimension)
print("Metric         :", metric_name)
print(idx)

Index dimension: 3
Metric         : angular
Annoy(f=3, metric='angular', n_items=0, n_trees=0, on_disk_path=None)


  idx = AnnoyIndex(f=3)


In [7]:
# =============================================================
# 1. Construction (dimension and metric both explicit)
# =============================================================
idx = AnnoyIndex(metric="angular", f=3)

for label, value in (("Index dimension:", idx.f), ("Metric         :", idx.metric)):
    print(label, value)

Index dimension: 3
Metric         : angular


In [8]:
# =============================================================
# 2. Add items
# =============================================================
# The three unit vectors of the 3-d space, stored under ids 0, 1 and 2.
unit_vectors = ([1, 0, 0], [0, 1, 0], [0, 0, 1])
for item_id, unit_vector in enumerate(unit_vectors):
    idx.add_item(item_id, unit_vector)

item_count = idx.get_n_items()
print("Number of items:", item_count)
print("Index dimension:", idx.f)
print("Metric         :", idx.metric)

Number of items: 3
Index dimension: 3
Metric         : angular


In [9]:
# =============================================================
# 1. Construction (100-d index prepared for an on-disk build)
# =============================================================
idx = AnnoyIndex(100, metric="angular")

dimension = idx.f
metric_name = idx.metric
print("Index dimension:", dimension)
print("Metric         :", metric_name)

# Kept as the final expression so the notebook echoes the call's return value.
idx.on_disk_build(
    "annoy_test.annoy",
)

Index dimension: 100
Metric         : angular


Annoy(f=100, metric='angular', n_items=0, n_trees=0, on_disk_path=annoy_test.annoy)

In [10]:
# =============================================================
# 2. Add items
# =============================================================
f = 100
n = 1000
for i in range(n):
    # Progress line at every tenth of the run.
    if i % (n // 10) == 0:
        print(f"{i} / {n} = {1.0 * i / n}")
    # One f-dimensional vector of standard-normal samples per item.
    v = [random.gauss(0, 1) for _ in range(f)]
    idx.add_item(i, v)

item_count = idx.get_n_items()
print("Number of items:", item_count)
print("Index dimension:", idx.f)
print("Metric         :", idx.metric)
print(idx)

0 / 1000 = 0.0
100 / 1000 = 0.1
200 / 1000 = 0.2
300 / 1000 = 0.3
400 / 1000 = 0.4
500 / 1000 = 0.5
600 / 1000 = 0.6
700 / 1000 = 0.7
800 / 1000 = 0.8
900 / 1000 = 0.9
Number of items: 1000
Index dimension: 100
Metric         : angular
Annoy(f=100, metric='angular', n_items=1000, n_trees=0, on_disk_path=annoy_test.annoy)


In [11]:
# =============================================================
# 3. Build index
# =============================================================
idx.build(10)

# Tree count and memory footprint after the build, then the full summaries.
tree_count = idx.get_n_trees()
footprint = idx.memory_usage()
print("Trees:", tree_count)
print("Memory usage:", footprint, "bytes")
for summary in (idx, idx.info()):
    print(summary)

Trees: 10
Memory usage: 543900 bytes
Annoy(f=100, metric='angular', n_items=1000, n_trees=10, on_disk_path=annoy_test.annoy)
{'dimension': 100, 'metric': 'angular', 'n_items': 1000, 'n_trees': 10, 'memory_usage_bytes': 543900, 'memory_usage_mib': 0.5187034606933594, 'on_disk_path': 'annoy_test.annoy'}


In [12]:
# =============================================================
# 1. Construction (dimension 0 with an explicit metric)
# =============================================================
idx = AnnoyIndex(0, metric="angular")

dimension, metric_name = idx.f, idx.metric
print("Index dimension:", dimension)
print("Metric         :", metric_name)

Index dimension: 0
Metric         : angular


In [13]:
# =============================================================
# 2. Add items
# =============================================================
# The index was created with dimension 0; the recorded output shows it
# reporting dimension 100 once these vectors have been added.
f, n = 100, 1000
for i in range(n):
    if not i % (n // 10):
        # Progress line at every tenth of the run.
        print(f"{i} / {n} = {1.0 * i / n}")
    v = [random.gauss(0, 1) for _ in range(f)]
    idx.add_item(i, v)

for label, value in (
    ("Number of items:", idx.get_n_items()),
    ("Index dimension:", idx.f),
    ("Metric         :", idx.metric),
):
    print(label, value)
print(idx)

0 / 1000 = 0.0
100 / 1000 = 0.1
200 / 1000 = 0.2
300 / 1000 = 0.3
400 / 1000 = 0.4
500 / 1000 = 0.5
600 / 1000 = 0.6
700 / 1000 = 0.7
800 / 1000 = 0.8
900 / 1000 = 0.9
Number of items: 1000
Index dimension: 100
Metric         : angular
Annoy(f=100, metric='angular', n_items=1000, n_trees=0, on_disk_path=None)


In [14]:
# =============================================================
# 3. Build index
# =============================================================
n_trees_requested = 10
idx.build(n_trees_requested)

print("Trees:", idx.get_n_trees())
memory_bytes = idx.memory_usage()
print("Memory usage:", memory_bytes, "bytes")
print(idx)
index_info = idx.info()
print(index_info)

Trees: 10
Memory usage: 611056 bytes
Annoy(f=100, metric='angular', n_items=1000, n_trees=10, on_disk_path=None)
{'dimension': 100, 'metric': 'angular', 'n_items': 1000, 'n_trees': 10, 'memory_usage_bytes': 611056, 'memory_usage_mib': 0.5827484130859375, 'on_disk_path': None}


In [15]:
# =============================================================
# 4. Query by item — the recorded result is an (ids, distances) tuple
# =============================================================
query_item, n_neighbours = 0, 5

# search_k is not passed here.
res = idx.get_nns_by_item(query_item, n_neighbours, include_distances=True)

print(res)

([0, 183, 293, 47, 715], [0.0, 1.1197848320007324, 1.201889991760254, 1.2287578582763672, 1.2694454193115234])


In [16]:
# =============================================================
# 8. Query using vector
# =============================================================
# A fresh random query point with the same dimension as the indexed vectors.
query_vector = [random.gauss(0, 1) for _ in range(f)]
res2 = idx.get_nns_by_vector(query_vector, 5, include_distances=True)
print("\nQuery by vector:", res2)


Query by vector: ([336, 741, 115, 718, 264], [1.2275598049163818, 1.2288236618041992, 1.2435855865478516, 1.2765949964523315, 1.2885053157806396])


In [17]:
# =============================================================
# 9. Low-level (non-result) mode
# =============================================================
# Without distances the call yields only the list of neighbour ids.
items = idx.get_nns_by_item(0, 2, include_distances=False)
print("\nLow-level items only:", items)

# With distances the call yields a pair, unpacked here as (ids, distances).
ids_and_distances = idx.get_nns_by_item(0, 2, include_distances=True)
items_low, d_low = ids_and_distances
print("Low-level tuple return:", items_low, d_low)


Low-level items only: [0, 183]
Low-level tuple return: [0, 183] [0.0, 1.1197848320007324]


In [18]:
# =============================================================
# 10. Persistence
# =============================================================
print("\n=== Saving with binary annoy ===")
print(idx)  # state before saving
idx.save("annoy_test.annoy")
print(idx)  # state after saving (the recorded output now shows on_disk_path)

print("Loading...")
# load() is chained on a fresh index; idx2 holds whatever load() returns.
fresh_index = AnnoyIndex(100, metric="angular")
idx2 = fresh_index.load("annoy_test.annoy")
print("Loaded index:", idx2)


=== Saving with binary annoy ===
Annoy(f=100, metric='angular', n_items=1000, n_trees=10, on_disk_path=None)
Annoy(f=100, metric='angular', n_items=1000, n_trees=10, on_disk_path=annoy_test.annoy)
Loading...
Loaded index: Annoy(f=100, metric='angular', n_items=1000, n_trees=10, on_disk_path=annoy_test.annoy)


In [19]:
# =============================================================
# 11. Raw serialize / deserialize
# =============================================================
print("\n=== Raw serialize ===")
buf = idx.serialize()

# Feed the serialized buffer to a fresh index; new_idx is deserialize()'s result.
empty_index = AnnoyIndex(100, metric="angular")
new_idx = empty_index.deserialize(buf)
print("Deserialized index n_items:", new_idx.get_n_items())

# Print the source and the round-tripped index for comparison.
for index_obj in (idx, new_idx):
    print(index_obj)


=== Raw serialize ===
Deserialized index n_items: 1000
Annoy(f=100, metric='angular', n_items=1000, n_trees=10, on_disk_path=annoy_test.annoy)
Annoy(f=100, metric='angular', n_items=1000, n_trees=10, on_disk_path=None)


In [20]:
# Unload the index; the recorded output shows n_items and n_trees both
# reported as 0 afterwards, with on_disk_path reset to None.
idx.unload()
print(idx)

Annoy(f=100, metric='angular', n_items=0, n_trees=0, on_disk_path=None)


In [21]:
# unbuild() on the already-unloaded index; the recorded summary is the same
# as in the previous cell (0 items, 0 trees).
idx.unbuild()
print(idx)

Annoy(f=100, metric='angular', n_items=0, n_trees=0, on_disk_path=None)


In [22]:

f = 10
idx = AnnoyIndex(f, "angular")

# Item k holds the constant vector [k, k, ..., k], so a vector read back from
# the wrong slot would be recognisable at a glance.
for i in range(20):
    constant_vector = [float(i) for _ in range(f)]
    idx.add_item(i, constant_vector)
idx.build(10)
idx.save("tmp_current.tree")

# Second index object that reads the file just written.
j = AnnoyIndex(f, "angular")
j.load("tmp_current.tree")

# Fetch the same item from the in-memory index and from the reloaded one.
probe_id = 15
u = idx.get_item_vector(probe_id)
v = j.get_item_vector(probe_id)

print("idx.f:", idx.f, "j.f:", j.f)
print("len(u):", len(u), "len(v):", len(v))
for label, vector in (("u:", u), ("v:", v)):
    print(label, vector)
print("u == v:", u == v)

idx.f: 10 j.f: 10
len(u): 10 len(v): 10
u: [15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0]
v: [15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0]
u == v: True


In [24]:
import os

# 1. Recreate a fresh index with the current build (every item is all zeros)
f = 10
idx = AnnoyIndex(f, "angular")
for i in range(20):
    zero_vector = [0.0 for _ in range(f)]
    idx.add_item(i, zero_vector)
idx.build(10)
idx.save("tmp_current.tree")
u = idx.get_item_vector(15)

# Compare the size of the new file with the test fixture's.
for label, tree_path in (
    ("tmp_current.tree size:", "tmp_current.tree"),
    ("test.tree size  :", "../tests/test.tree"),
):
    print(label, os.path.getsize(tree_path))
u


tmp_current.tree size: 3432
test.tree size  : 18824


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [25]:

# Quick sanity load: open the file written by the previous cell in a second
# index object and read item 15 back.
# The vector is fetched once; the discarded duplicate lookup was removed.
j = AnnoyIndex(f, "angular")
j.load("tmp_current.tree")  # this should succeed if build is OK
v = j.get_item_vector(15)   # should also work
v

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [26]:
# Same item from the in-memory index and from the reloaded copy.
probe_id = 15
u, v = idx.get_item_vector(probe_id), j.get_item_vector(probe_id)

(u, v, u == v)

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 True)

In [28]:
import random
from pathlib import Path

from annoy import AnnoyIndex

# Re-seed so the vectors written to the tree file are the same on every run.
random.seed(0)

HERE = Path.cwd().resolve()
# NOTE(review): the ".." segment is not collapsed, so it appears verbatim in
# the printed and stored path.
OUT = HERE.joinpath("../tests", "test_v2.tree")

f, n = 10, 1000
idx = AnnoyIndex(f, "angular")
for i in range(n):
    gaussian_vector = [random.gauss(0, 1) for _ in range(f)]
    idx.add_item(i, gaussian_vector)

idx.build(10)
idx.save(str(OUT))
print("Wrote", OUT)

Wrote /work/scikitplot/cexternals/annoy/examples/../tests/test_v2.tree


In [29]:
# Ten nearest neighbours of item 0 (ids only).
idx.get_nns_by_item(
    0,
    10,
)

[0, 736, 940, 348, 63, 798, 235, 56, 473, 679]

In [30]:
# Summary dict for the index saved above: dimension, metric, item and tree
# counts, memory usage and the on-disk path.
idx.info()

{'dimension': 10,
 'metric': 'angular',
 'n_items': 1000,
 'n_trees': 10,
 'memory_usage_bytes': 60,
 'memory_usage_mib': 5.7220458984375e-05,
 'on_disk_path': '/work/scikitplot/cexternals/annoy/examples/../tests/test_v2.tree'}