In [1]:
import os
# Blank out CUDA_VISIBLE_DEVICES before the heavy imports in the next cell;
# intended to keep every library in this notebook on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ""

In [2]:
from langchain.embeddings import HuggingFaceEmbeddings
from CustomFAISS import *
import pandas as pd
import numpy as np
import faiss

In [3]:
# Earlier experiment on a smaller CSV of embeddings, kept commented out for reference.
# data = pd.read_csv('embeddings_maths.csv', usecols=[i for i in range(2, 386)]).to_numpy()
# train_data = data[:13900, :]
# test_data = data[13900:, :]

# Load the embedding matrix; the first 999,950 rows are indexed and every
# remaining row is held out as a query vector.
data = np.load('embeddings1M.npy')
train_data = data[:999_950, :]
test_data = data[999_950:, :]

# Size of the full array in GiB (bytes / 1024**3); displayed as the cell output.
data.nbytes / (1024 * 1024 * 1024) #GB

5.7220458984375

In [4]:
# Hugging Face identifier of the sentence-embedding model.
model_path = "sentence-transformers/all-MiniLM-l6-v2"

# Instantiate the LangChain embedding wrapper on CPU.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before paying the load cost.
model = HuggingFaceEmbeddings(
                            model_name=model_path,
                            model_kwargs={'device': 'cpu'},
                            )

## FAISS Baseline

In [4]:
def recall_n(X, baseline, n):
    """Return recall@n of ``X`` against ``baseline`` as a percentage.

    For every query row, each of the first ``n`` ids in ``X`` counts as a hit
    when it also appears among the first ``n`` ids of the same row in
    ``baseline``. The result is ``100 * hits / (rows * n)``.

    Args:
        X: 2-D array-like of retrieved ids, one row per query.
        baseline: 2-D array-like of reference ids with the same shape as ``X``.
        n: Number of leading columns to compare; must be positive and no
            larger than the number of columns.

    Returns:
        Recall in the range [0, 100] as a float; ``0.0`` when there are no
        query rows (instead of dividing by zero).

    Raises:
        AssertionError: If the shapes differ or ``baseline`` has fewer than
            ``n`` columns.
        ValueError: If ``n`` is not positive.
    """
    X = np.asarray(X)
    baseline = np.asarray(baseline)
    assert X.shape == baseline.shape
    assert baseline.shape[1] >= n
    if n <= 0:
        raise ValueError("n must be a positive integer")

    n_queries = X.shape[0]
    if n_queries == 0:
        return 0.0

    # hits[i, j] is True when X[i, j] occurs anywhere in baseline[i, :n];
    # broadcasting replaces the original per-element Python loops.
    hits = (X[:, :n, None] == baseline[:, None, :n]).any(axis=2)

    return int(hits.sum()) / (n_queries * n) * 100

def smetric_9(X, baseline, k=9):
    """Return the mean rank-weighted overlap between ``X`` and ``baseline``.

    For every query row, the reference id at rank ``j`` (0-based) in
    ``baseline`` contributes ``k - j`` points when it appears anywhere in the
    first ``k`` ids of the same row in ``X``. Scores are averaged over rows,
    so a perfect result is ``k * (k + 1) / 2`` (45 for the default ``k=9``).

    Args:
        X: 2-D array-like of retrieved ids, one row per query.
        baseline: 2-D array-like of reference ids with the same shape as ``X``.
        k: Number of leading columns to score. Defaults to 9, which
            reproduces the original fixed-depth metric.

    Returns:
        Mean score per query as a float; ``0.0`` when there are no query rows
        (instead of dividing by zero).

    Raises:
        AssertionError: If the shapes differ or ``baseline`` has fewer than
            ``k`` columns.
        ValueError: If ``k`` is not positive.
    """
    X = np.asarray(X)
    baseline = np.asarray(baseline)
    assert X.shape == baseline.shape
    assert baseline.shape[1] >= k
    if k <= 0:
        raise ValueError("k must be a positive integer")

    n_queries = X.shape[0]
    if n_queries == 0:
        return 0.0

    # found[i, j] is True when baseline[i, j] occurs anywhere in X[i, :k].
    found = (baseline[:, :k, None] == X[:, None, :k]).any(axis=2)
    # Rank 0 is worth k points, rank k-1 is worth 1 point.
    weights = np.arange(k, 0, -1)

    return int((found * weights).sum()) / n_queries

In [5]:
#%%timeit
# FAISS flat index using the L2 metric over all training vectors.
# NOTE(review): the following cell rebinds `index` to an inner-product flat
# index, so whichever of the two cells ran last decides what
# `closest_baseline` actually measures.
index = faiss.IndexFlatL2(train_data.shape[1])
index.train(train_data)
index.add(train_data)

In [5]:
#%%timeit
# FAISS flat index scored by inner product. In top-to-bottom order this
# replaces the L2 flat index built in the previous cell under the same name.
index = faiss.IndexFlatIP(train_data.shape[1])
index.train(train_data)
index.add(train_data)

In [6]:
# Copy the flat index's stored codes into a NumPy array and report their size.
# The divisor is 1024**3, so the displayed value is in GiB.
codes = faiss.vector_to_array(index.codes)
codes.nbytes / (1024 * 1024 * 1024) #GB

2.860879898071289

In [8]:
%%timeit
# Single-query latency: the first test vector reshaped to a 1-row batch.
# Names assigned inside a %%timeit cell are not kept in the notebook namespace.
_, closest_baseline = index.search(test_data[0].reshape(1,-1), k=10)

240 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
#%%timeit
# Top-10 neighbour ids for every test vector from the flat index. Every metric
# cell below uses `closest_baseline` as the reference ranking.
_, closest_baseline = index.search(test_data, k=10)

## Custom Flat

In [9]:
%%timeit
# Times construction (train + add) of the custom flat index; the `index`
# assigned here is discarded once timing finishes.
index = CustomIndexFlat(train_data.shape[1])
index.train(train_data)
index.add(train_data)

1.21 s ± 9.76 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Build the custom flat index outside %%timeit so it persists for the cells below.
# NOTE(review): this rebinds `index`, which previously held the FAISS flat
# index — `closest_baseline` must already have been computed before this runs.
index = CustomIndexFlat(train_data.shape[1])
index.train(train_data)
index.add(train_data)

In [11]:
# Size in GiB of the codes held by the custom flat index (read via `.nbytes`).
codes = index.codes
codes.nbytes / (1024 * 1024 * 1024) #GB

2.860879898071289

In [16]:
#%%timeit
# Single-query search on the custom flat index (first test vector as a 1-row batch).
_, closest_customflat = index.search(test_data[0].reshape(1,-1), k=10)

In [None]:
%%timeit
# Batch-search latency over all test vectors; the result is discarded after timing.
_, closest_customflat = index.search(test_data, k=10)

In [15]:
# Same batch search run outside %%timeit so `closest_customflat` is kept for scoring.
_, closest_customflat = index.search(test_data, k=10)

In [13]:
# Score the custom flat index against the FAISS flat baseline:
# rank-weighted overlap over the top 9, then recall over the top 10 and top 1.
print(f'smetric@9 = {smetric_9(closest_customflat, closest_baseline):.2f}')
print(f'recall@10 = {recall_n(closest_customflat, closest_baseline, 10):.1f}')
print(f'recall@1 = {recall_n(closest_customflat, closest_baseline, 1):.1f}')

smetric@9 = 44.96
recall@10 = 99.0
recall@1 = 100.0


## FAISS PQ

In [16]:
#%%timeit
# FAISS product quantizer: 48 sub-quantizers with 8 bits each.
index_pq = faiss.IndexPQ(train_data.shape[1], 48, 8)
index_pq.train(train_data)
index_pq.add(train_data)

In [17]:
# Size in GiB of the PQ codes stored by the FAISS index.
codes = faiss.vector_to_array(index_pq.codes)
codes.nbytes / (1024 * 1024 * 1024) #GB

0.04470124840736389

In [18]:
%%timeit
# Single-query latency on the FAISS PQ index; the assignment is discarded after timing.
_, closest_PQ = index_pq.search(test_data[0].reshape(1,-1), k=10)

21 ms ± 790 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
#%%timeit
# Batch search kept in the namespace for scoring.
# NOTE(review): the timing recorded under this cell comes from a run with
# %%timeit enabled; with the magic commented out the cell prints nothing.
_, closest_PQ = index_pq.search(test_data, k=10)

45.5 ms ± 447 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# Score the FAISS PQ results against the flat baseline.
print(f'smetric@9 = {smetric_9(closest_PQ, closest_baseline):.2f}')
print(f'recall@10 = {recall_n(closest_PQ, closest_baseline, 10):.1f}')
print(f'recall@1 = {recall_n(closest_PQ, closest_baseline, 1):.1f}')

smetric@9 = 31.30
recall@10 = 60.8
recall@1 = 54.0


## Custom PQ

In [22]:
#%%timeit
# Custom product-quantization index from CustomFAISS.
# presumably 48 is the number of sub-quantizers, mirroring faiss.IndexPQ above,
# and init/estimator select the k-means variant — confirm in CustomFAISS.
index_custompq = CustomIndexPQ(train_data.shape[1], 48, init='random', estimator='minibatchKMeans')
index_custompq.train(train_data)
index_custompq.add(train_data)

In [24]:
# Size in GiB of the codes held by the custom PQ index (read via `.nbytes`).
codes = index_custompq.codes
codes.nbytes / (1024 * 1024 * 1024) #GB

0.04470124840736389

In [25]:
%%timeit
# Single-query latency on the custom PQ index. Unlike the FAISS cells, the
# query is passed as a 1-D vector (no reshape). Assignment is discarded after timing.
_, closest_PQ = index_custompq.search(test_data[0], k=10)

336 ms ± 254 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
#%%timeit
# Batch search kept in the namespace for scoring.
# NOTE(review): reuses the name `closest_PQ`, overwriting the FAISS PQ results
# computed in the previous section.
_, closest_PQ = index_custompq.search(test_data, k=10)

In [27]:
# Score the custom PQ results (currently bound to `closest_PQ`) against the flat baseline.
print(f'smetric@9 = {smetric_9(closest_PQ, closest_baseline):.2f}')
print(f'recall@10 = {recall_n(closest_PQ, closest_baseline, 10):.1f}')
print(f'recall@1 = {recall_n(closest_PQ, closest_baseline, 1):.1f}')

smetric@9 = 30.34
recall@10 = 60.2
recall@1 = 46.0


## FAISS IVF

In [14]:
#%%timeit
# FAISS IVF-Flat with the L2 metric: 256 inverted lists, 32 of them probed per query.
# NOTE(review): rebinds `index`; the following cell then rebuilds it with the
# inner-product metric, so whichever ran last is the one searched below.
nlist, nprobe = 256, 32
quantizer = faiss.IndexFlatL2(train_data.shape[1])  # the other index
index = faiss.IndexIVFFlat(quantizer, train_data.shape[1], nlist, faiss.METRIC_L2)
index.nprobe = nprobe

index.train(train_data)
index.add(train_data)

In [19]:
#%%timeit
# Inner-product variant of the IVF-Flat index with the same nlist/nprobe.
# In top-to-bottom order this replaces the L2 IVF index from the previous cell.
nlist, nprobe = 256, 32
quantizer = faiss.IndexFlatIP(train_data.shape[1])  # the other index
index = faiss.IndexIVFFlat(quantizer, train_data.shape[1], nlist, faiss.METRIC_INNER_PRODUCT)
index.nprobe = nprobe

index.train(train_data)
index.add(train_data)

In [None]:
# IndexIVFFlat keeps its vectors in inverted lists and exposes no flat `codes`
# array (the original `index.codes` lookup fails on this index type), so derive
# the stored-vector size from the per-vector code size instead.
# This counts the encoded vectors only, not the stored ids or coarse centroids.
codes_nbytes = index.ntotal * index.code_size
codes_nbytes / (1024 * 1024 * 1024) #GB

In [22]:
%%timeit
# Single-query latency on the FAISS IVF index; the assignment is discarded after timing.
_, closest_ivf = index.search(test_data[0].reshape(1,-1), k=10)

29.2 ms ± 17.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
%%timeit
# Batch-search latency over all test vectors; the result is discarded after timing.
_, closest_ivf = index.search(test_data, k=10)

134 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# Same batch search run outside %%timeit so `closest_ivf` is kept for scoring.
_, closest_ivf = index.search(test_data, k=10)

In [25]:
# Score the FAISS IVF results against the flat baseline.
print(f'smetric@9 = {smetric_9(closest_ivf, closest_baseline):.2f}')
print(f'recall@10 = {recall_n(closest_ivf, closest_baseline, 10):.1f}')
print(f'recall@1 = {recall_n(closest_ivf, closest_baseline, 1):.1f}')

smetric@9 = 43.66
recall@10 = 96.4
recall@1 = 96.0


## Custom IVF

In [16]:
import cProfile

In [10]:
#%%timeit
# Custom IVF index from CustomFAISS with the same nlist/nprobe as the FAISS IVF cells.
# presumably pca_dim is the dimensionality kept after a PCA step and
# init/estimator select the k-means variant — confirm in CustomFAISS.
index_customivf = CustomOptimizedIndexIVF(d=train_data.shape[1], nlist=256, nprobe=32, pca_dim=384, init='random', estimator='minibatchKMeans')
index_customivf.train(train_data)
index_customivf.add(train_data)

In [11]:
# `index_customivf.codes` is iterated as a collection of arrays; sum their
# byte sizes and report the total in GiB.
nbytes = [x.nbytes for x in index_customivf.codes]
sum(nbytes) / (1024 * 1024 * 1024) #GB

1.4304399490356445

In [12]:
%%timeit
# Single-query latency on the custom IVF index (1-D query, single return value
# rather than the (distances, ids) pair used by the FAISS cells).
closest_IVF = index_customivf.search(test_data[0], k=10)

439 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%%timeit
# Batch-search latency over all test vectors; the result is discarded after timing.
closest_IVF = index_customivf.search(test_data, k=10)

6.97 s ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# Same batch search run outside %%timeit so `closest_IVF` is kept for scoring.
closest_IVF = index_customivf.search(test_data, k=10)

In [14]:
# Score the custom IVF results against the flat baseline.
print(f'smetric@9 = {smetric_9(closest_IVF, closest_baseline):.2f}')
print(f'recall@10 = {recall_n(closest_IVF, closest_baseline, 10):.1f}')
print(f'recall@1 = {recall_n(closest_IVF, closest_baseline, 1):.1f}')

smetric@9 = 43.86
recall@10 = 94.6
recall@1 = 94.0


## FAISS IVFPQ

In [67]:
#%%timeit
# FAISS IVF + PQ: 1024 inverted lists (128 probed per query), vectors encoded
# with m=384 sub-quantizers of 8 bits each. Rebinds `index` once more.
nlist, nprobe, m = 1024, 128, 384
quantizer = faiss.IndexFlatL2(train_data.shape[1])  # the other index
index = faiss.IndexIVFPQ(quantizer, train_data.shape[1], nlist, m, 8)
index.nprobe = nprobe

index.train(train_data)
index.add(train_data)



In [70]:
%%timeit
# Single-query latency on the IVFPQ index; the assignment is discarded after timing.
_, closest_ivf = index.search(test_data[0].reshape(1,-1), k=10)

6.09 ms ± 4.76 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [55]:
%%timeit
# Batch-search latency over all test vectors; the result is discarded after timing.
# NOTE(review): the recorded timing carries execution count 55, earlier than
# the index build above (67) — re-run before quoting the number.
_, closest_ivf = index.search(test_data, k=10)

580 µs ± 83.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [68]:
# Batch search kept for scoring. Reuses the name `closest_ivf`, overwriting the
# FAISS IVF-Flat results from the earlier section.
_, closest_ivf = index.search(test_data, k=10)

In [69]:
# Score the IVFPQ results (currently bound to `closest_ivf`) against the flat baseline.
print(f'smetric@9 = {smetric_9(closest_ivf, closest_baseline):.2f}')
print(f'recall@10 = {recall_n(closest_ivf, closest_baseline, 10):.1f}')
print(f'recall@1 = {recall_n(closest_ivf, closest_baseline, 1):.1f}')

smetric@9 = 44.72
recall@10 = 98.5
recall@1 = 100.0


## FAISS PQ w/ OPQ

In [47]:
# Build an index from the factory string "OPQ32,PQ32": an OPQ transform
# followed by a 32-sub-quantizer PQ. Rebinds `index` once more.
index = faiss.index_factory(train_data.shape[1], "OPQ32,PQ32")

index.train(train_data)
index.add(train_data)

In [49]:
%%timeit
# Single-query latency on the OPQ+PQ index; the assignment is discarded after timing.
_, closest_PQ = index.search(test_data[0].reshape(1,-1), k=10)

14.6 ms ± 130 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [52]:
%%timeit
# Batch-search latency over all test vectors.
# NOTE(review): because this runs under %%timeit, `closest_PQ` is NOT updated
# here; this section has no un-timed search cell of its own.
_, closest_PQ = index.search(test_data, k=10)

32.4 ms ± 345 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [53]:
# Both search cells in this section run under %%timeit, which discards their
# assignments, so `closest_PQ` would still hold the results of an earlier
# section. Run the batch search once here so the scores below really belong
# to the OPQ+PQ index currently bound to `index`.
_, closest_opq = index.search(test_data, k=10)

print(f'smetric@9 = {smetric_9(closest_opq, closest_baseline):.2f}')
print(f'recall@10 = {recall_n(closest_opq, closest_baseline, 10):.1f}')
print(f'recall@1 = {recall_n(closest_opq, closest_baseline, 1):.1f}')

smetric@9 = 31.28
recall@10 = 58.4
recall@1 = 56.0


## Custom IVF IP

In [None]:
#%%timeit
# Inner-product variant of the custom IVF index, same hyper-parameters as the
# custom IVF section above. Rebinds `index_customivf`.
# presumably pca_dim/init/estimator carry the same meaning as in
# CustomOptimizedIndexIVF — confirm in CustomFAISS.
index_customivf = CustomOptimizedIndexIVF_IP(d=train_data.shape[1], nlist=256, nprobe=32, pca_dim=384, init='random', estimator='minibatchKMeans')
index_customivf.train(train_data)
index_customivf.add(train_data)

In [33]:
# Sum the byte sizes of the arrays in `index_customivf.codes` and report GiB.
nbytes = [x.nbytes for x in index_customivf.codes]
sum(nbytes) / (1024 * 1024 * 1024) #GB

1.4304399490356445

In [34]:
%%timeit
# Single-query latency on the custom IVF (inner-product) index; result discarded after timing.
closest_IVF = index_customivf.search(test_data[0], k=10)

80.2 ms ± 2.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
%%timeit
# Batch-search latency over all test vectors; the result is discarded after timing.
closest_IVF = index_customivf.search(test_data, k=10)

5.38 s ± 6.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
# Batch search kept for scoring. Reuses the name `closest_IVF`, overwriting
# the results of the L2 custom IVF section.
closest_IVF = index_customivf.search(test_data, k=10)

In [36]:
# Score the custom IVF (inner-product) results against the flat baseline.
# NOTE(review): the baseline's metric depends on which flat-index cell ran
# last (L2 or inner product) — confirm it matches this index before comparing.
print(f'smetric@9 = {smetric_9(closest_IVF, closest_baseline):.2f}')
print(f'recall@10 = {recall_n(closest_IVF, closest_baseline, 10):.1f}')
print(f'recall@1 = {recall_n(closest_IVF, closest_baseline, 1):.1f}')

smetric@9 = 35.30
recall@10 = 72.0
recall@1 = 66.0
