In [2]:
# List the working directory (IPython automagic for the shell `ls`).
ls

README.md    build_index.ipynb  build_split_meta.py      [0m[01;34mmetadata[0m/
api.py       build_index.py     config_IVF1024PQ48.json  [01;34mshards[0m/
build_db.py  build_shard.py     [01;34mdb[0m/                      [01;34msplits[0m/


In [3]:
import argparse
import json
import os
import pickle

import faiss
import lmdb
import numpy as np
import ray
from tqdm import tqdm


In [4]:

# Selects which shard file (./shards/<rank>.parquet) this run indexes.
rank = 0

# --- Load the shard as a Ray dataset ---------------------------------------
print('Loading data...')
shard_path = f'./shards/{rank}.parquet'
ds = ray.data.read_parquet(shard_path)
print('Data loaded!')
print('Data count:', ds.count())

# --- Create an (untrained) IVF-PQ index and clone it onto the GPU ----------
print('Building index...')
res = faiss.StandardGpuResources()
print("starting faiss")
res.setTempMemory(64 * 1024 * 1024)  # 64 MiB, in bytes
co = faiss.GpuClonerOptions()
co.useFloat16 = True  # must be set before the clone below picks up `co`
print("Using FP16")
cpu_index = faiss.index_factory(384, 'IVF1024,PQ64')
index = faiss.index_cpu_to_gpu(res, 0, cpu_index, co)

# --- Pull the training sample into one float32 matrix ----------------------
# `take` returns at most this many rows (2**20), each read via row['embedding'].
train_data = ds.take(2 ** 20)
xt = np.stack([row['embedding'] for row in train_data]).astype(np.float32)

Loading data...


2022-11-26 12:44:35,467	INFO worker.py:1518 -- Started a local Ray instance.
[2m[36m(_get_read_tasks pid=18175)[0m   self._metadata = meta_provider.prefetch_file_metadata(pq_ds.pieces) or []
[2m[36m(_get_read_tasks pid=18175)[0m   np.array_split(self._pq_ds.pieces, parallelism),


Data loaded!
Data count: 2107181
Building index...
starting faiss
Using FP16


In [6]:
# Sanity check: (rows taken for training, embedding width) of the matrix
# that is passed to index.train.
xt.shape

(1048576, 384)

In [7]:
# Train the GPU index on the float32 training matrix `xt` built in an earlier
# cell. The two prints bracket the call so the log shows when training
# started and finished.
print('Training data loaded!')
index.train(xt)
print('Training finished!')

Training data loaded!
Training finished!


In [8]:

# Stream every embedding of the shard into the trained index, `bs` rows at a
# time, so the whole shard never has to be held in one array.
bs = 4096

# Ceiling division: the last, partial batch is still an iteration. Floor
# division under-counts by one whenever the row count is not a multiple of
# `bs`, which made the progress bar run past its total (515 of 514).
n_batches = -(-ds.count() // bs)

# Ask for pandas batches explicitly: `batch.embedding.values` below needs a
# DataFrame, and the default batch format differs between Ray versions.
for batch in tqdm(ds.iter_batches(batch_size=bs, batch_format='pandas'),
                  total=n_batches):
    xb = np.stack(batch.embedding.values).astype('float32')
    index.add(xb)
print('Index built!')


515it [02:02,  4.20it/s]                         

Index built!





RuntimeError: Error in faiss::FileIOWriter::FileIOWriter(const char*) at /home/conda/feedstock_root/build_artifacts/faiss-split_1644327822094/work/faiss/impl/io.cpp:97: Error: 'f' failed: could not open ./indexes_IVF1024PQ64/0 for writing: No such file or directory

In [9]:
    
# Write the index to disk, converted back to a CPU index first.
# faiss.write_index does not create missing parent directories: without the
# makedirs call the first run fails with "could not open ... for writing:
# No such file or directory".
out_dir = './indexes_IVF1024PQ64'
os.makedirs(out_dir, exist_ok=True)
faiss.write_index(faiss.index_gpu_to_cpu(index), f'{out_dir}/{rank}')
print('Index written!')

Index written!


In [1]:

import numpy as np
import faiss                     # make faiss available


def _random_vectors(count, dim):
    """Return a (count, dim) float32 array of uniform random values.

    Column 0 of row i is additionally shifted by i / 1000. Draws from the
    global NumPy RNG, so the caller controls reproducibility via
    np.random.seed.
    """
    vecs = np.random.random((count, dim)).astype('float32')
    vecs[:, 0] += np.arange(count) / 1000.
    return vecs


# Problem size.
d = 64          # dimension
nb = 100000     # database size
nq = 10000      # nb of queries

# Seed once, then draw the database before the queries (order matters for
# reproducing the same arrays).
np.random.seed(1234)
xb = _random_vectors(nb, d)
xq = _random_vectors(nq, d)

ngpus = faiss.get_num_gpus()
print("number of GPUs:", ngpus)

# Exact L2 index built on the CPU, then handed to index_cpu_to_all_gpus.
cpu_index = faiss.IndexFlatL2(d)
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)

# Populate the index and report how many vectors it holds.
gpu_index.add(xb)
print(gpu_index.ntotal)

# Search for the k nearest neighbours of every query vector.
k = 4
D, I = gpu_index.search(xq, k)
print(I[:5])     # neighbors of the 5 first queries
print(I[-5:])    # neighbors of the 5 last queries



number of GPUs: 2
100000
[[ 381  207  210  477]
 [ 526  911  142   72]
 [ 838  527 1290  425]
 [ 196  184  164  359]
 [ 526  377  120  425]]
[[ 9900 10500  9309  9831]
 [11055 10895 10812 11321]
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]
