In [3]:
import numpy as np

In [2]:
import pandas as pd
# import sentence transformers
from sentence_transformers import SentenceTransformer

In [3]:
df = pd.read_csv('all_products.csv')

In [5]:
df.shape

(116851, 4)

In [6]:
model = SentenceTransformer('my-128dim-model')

In [7]:
embeddings = model.encode(df['name'], convert_to_tensor=False, batch_size=32,show_progress_bar=True)

Batches: 100%|██████████| 3652/3652 [11:40<00:00,  5.22it/s]


In [8]:
# save embeddings as npy
np.save('embeddings.npy', embeddings)

In [4]:
with open('embeddings.npy', 'rb') as f:
    embeddings = np.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'embeddings.npy'

In [7]:
wb = embeddings

In [8]:
import faiss

# Flat

In [11]:
d = 128
k = 10
index_flat = faiss.IndexFlatIP(d)
index_flat.add(wb)

In [12]:
%%timeit
D,I = index_flat.search(wb[2000:2001], k)

16.5 ms ± 287 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
D_flat,I_flat = index_flat.search(wb[2000:2001], k)

In [14]:
I_flat

array([[ 2000, 41967, 12450,  1878, 34100, 11666, 33053, 21798, 14495,
         3008]], dtype=int64)

In [15]:
baseline = I_flat[0].tolist()

In [27]:
faiss.write_index(index_flat,'flat.index')

# LSH

In [21]:
nbits = d*4
index = faiss.IndexLSH(d,nbits)
index.add(wb)

In [22]:
%%timeit
D,I = index.search(wb[2000:2001], k)

1.63 ms ± 64.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
D,I = index.search(wb[2000:2001], k)

In [24]:
I

array([[ 2000, 41967,  6927, 11666, 64204, 34097,  1890, 71143, 33297,
        30451]], dtype=int64)

In [25]:
np.in1d(baseline,I)

array([ True,  True, False, False, False,  True, False, False, False,
       False])

In [34]:
faiss.write_index(index,'LSH.index')

# HNSW

In [46]:
M = 16
ef_search = 8
ef_construction = 64

In [41]:
index = faiss.IndexHNSWFlat(d, M)
index.hnsw.efSearch = ef_search
index.hnsw.efConstruction = ef_construction
index.add(wb)

In [42]:
%%timeit
D,I = index.search(wb[2000:2001], k)

105 µs ± 12.4 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [43]:
D,I = index.search(wb[2000:2001], k)

In [44]:
I

array([[  2000,  41967,   6927,  34097, 111101,  54558, 104947,  14495,
         41001,  15054]], dtype=int64)

In [45]:
np.in1d(baseline,I)

array([ True,  True, False, False, False, False, False, False,  True,
       False])

In [32]:
faiss.write_index(index,'HNSW.index')

# IVF

In [48]:
nlist = 128
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(wb)

In [49]:
index.add(wb)

In [64]:
index.nprobe = 3

In [65]:
%%timeit
D,I = index.search(wb[2000:2001], k)

1.79 ms ± 379 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [68]:
D,I = index.search(wb[2000:2001], k)

In [69]:
I

array([[ 2000, 41967, 12450,  1878, 34100, 11666, 33053, 21798, 14495,
         6088]], dtype=int64)

In [70]:
np.in1d(baseline,I)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False])

In [52]:
faiss.write_index(index,'IVFFlat.index')

# IVFPQ

In [98]:
m = 8
bits = 8
nlist = 256
quantizer  = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer,d,nlist,m,bits,faiss.METRIC_L2)

In [99]:
index.train(wb)
index.add(wb)
index.nprobe = 3

In [100]:
%%timeit
D,I = index.search(wb[2000:2001], k)

1.44 ms ± 289 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [101]:
D,I = index.search(wb[2000:2001], k)

In [102]:
I

array([[  2000,  41967,  90144,  64204,  54558, 111101,  41001, 114230,
         87589,   7270]], dtype=int64)

In [103]:
np.in1d(baseline,I)

array([ True,  True, False, False, False, False, False, False, False,
       False])

In [78]:
faiss.write_index(index,'IVFPQ.index')

# Flat index + PCA

#####https://ai.plainenglish.io/speeding-up-similarity-search-in-recommender-systems-using-faiss-basics-part-i-ec1b5e92c92d

In [27]:
vector_dimension = 128
reduced_dimension = 32

In [28]:
pca_matrix = faiss.PCAMatrix(vector_dimension,reduced_dimension,0,False)
sub_index = faiss.IndexFlatL2(reduced_dimension)
index = faiss.IndexPreTransform(pca_matrix,sub_index)
index.train(embeddings)
index.add(embeddings)

In [29]:
%%timeit
D,I = index.search(wb[2000:2001], 10)

4.7 ms ± 590 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
D,I = index.search(wb[2000:2001], 10)

In [31]:
I

array([[ 2000, 41967, 30284, 11666, 12108, 17925,  7034, 41001,  6927,
        64204]], dtype=int64)

In [32]:
np.in1d(baseline,I)

array([ True,  True, False, False, False,  True, False, False, False,
       False])

In [33]:
faiss.write_index(index,'PCA_Flat.index')

# IVFPQ + PCA

In [34]:
m = 8
bits = 8
nlist = 256
pca_matrix = faiss.PCAMatrix(vector_dimension,reduced_dimension,0,False)
quantizer  = faiss.IndexFlatL2(reduced_dimension)
sub_index = faiss.IndexIVFPQ(quantizer,reduced_dimension,nlist,m,bits,faiss.METRIC_L2)
index = faiss.IndexPreTransform(pca_matrix,sub_index)
index.train(embeddings)
index.add(embeddings)

In [35]:
%%timeit
D,I = index.search(wb[2000:2001], 10)

2.62 ms ± 605 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [36]:
D,I = index.search(wb[2000:2001], 10)

In [37]:
I

array([[ 2000, 41967, 11666, 30284,  7034, 17925,  9653, 41001,  1890,
        34095]], dtype=int64)

In [38]:
np.in1d(baseline,I)

array([ True,  True, False, False, False,  True, False, False, False,
       False])

In [39]:
faiss.write_index(index,'PCA_IVFPQ.index')

## Real Test

In [62]:
search = ["milton mop"]
search_vector = model.encode(search)

In [63]:
search_vector.shape

(1, 128)

In [64]:
D,I = index.search(search_vector, 10)

In [65]:
I[0]

array([111047,  61500,  74150,  43792,  77397,  68632, 110337,  31626,
       115209,  15795], dtype=int64)

In [66]:
for i  in I[0]:
    print(df.iloc[i]['name'])

milton mop
zedoary
riluzole
prochlorperazine maleate
allantoin
plexiglas
p-cymene
pestle
trichoderma harzianum
achillea millefolium


## Index in Full Glory

In [10]:
quantizer = faiss.IndexFlatL2(128)
index = faiss.IndexIVFFlat(quantizer, 128, 256)
index.train(wb)
index.add(wb)

In [11]:
index_f = faiss.index_factory(128,"IVF256,Flat")
index_f.train(wb)
index_f.add(wb)

In [12]:
k = 10
D,I = index.search(wb[1000:1001], k)
I

array([[ 1000, 11359, 39635, 34535,  6997, 39977, 56800,  5183,  1001,
        51371]], dtype=int64)

In [14]:
k = 10
D_f,I_f = index_f.search(wb[1000:1001], k)
I_f

array([[ 1000, 11359, 39635, 34535,  6997, 39977, 56800,  5183,  1001,
        51371]], dtype=int64)

## More Complex

In [16]:
d = wb.shape[1]
m = 32
nbits = 8
nlist = 256

# we initialize our OPQ and Coarse Fine Quantizer steps separately
opq = faiss.OPQMatrix(d,m)
vecs = faiss.IndexFlatL2(d)
sub_index = faiss.IndexIVFPQ(vecs,d,nlist,m,nbits)
index = faiss.IndexPreTransform(opq,sub_index)

index.train(wb)
index.add(wb)

In [18]:
k = 10
D,I = index.search(wb[1000:1001], k)
I

array([[ 1000, 11359, 56800,  6997, 34535, 39977, 39635, 59871,  1001,
        16149]], dtype=int64)

In [17]:
index_f = faiss.index_factory(d,"OPQ32,IVF256,PQ32,RFlat")
index_f.train(wb)
index_f.add(wb)

In [19]:
k = 10
D_f,I_f = index_f.search(wb[1000:1001], k)
I_f

array([[ 1000, 11359, 39635, 34535,  6997, 39977, 56800,  1001, 59871,
        16149]], dtype=int64)

In [20]:
faiss.write_index(index,'OPQ32-IVF256-PQ32-RFlat.index')

## Using HNSW as quantizer

In [27]:
D = 128
# Param of PQ
M = 16  # The number of sub-vector. Typically this is 8, 16, 32, etc.
nbits = 8 # bits per sub-vector. This is typically 8, so that each sub-vec is encoded by 1 byte
# Param of IVF
nlist = 1000  # The number of cells (space partition). Typical value is sqrt(N)
# Param of HNSW
hnsw_m = 32  # The number of neighbors for HNSW. This is typically 32

# Setup
quantizer = faiss.IndexHNSWFlat(D, hnsw_m)
index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits)

# Train
index.train(wb)

# Add
index.add(wb)

In [25]:
index.nprobe = 8  # Runtime param. The number of cells that are visited for search.
k = 10
D_f,I_f = index_f.search(wb[1000:1001], k)
I_f

array([[ 1000, 11359, 39635, 34535,  6997, 39977, 56800,  1001, 59871,
        16149]], dtype=int64)

In [37]:
index.nprobe = 8  # Runtime param. The number of cells that are visited for search.
k = 10
D_f,I_f = index.search(wb[2000:2001], k)
I_f

array([[  2000,  41967,  54558,  33297,  41001, 100150,  34097,   6088,
        104947,   1878]], dtype=int64)

In [34]:
np.in1d(I,I_f)

array([ True,  True, False,  True, False,  True, False,  True, False,
       False])

In [35]:
faiss.write_index(index,'HNSW-IVFPQ.index')

In [38]:
%%timeit
D_f,I_f = index.search(wb[2000:2001], k)

909 µs ± 53.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## IVFADC Index

In [9]:
d = 128
index = faiss.index_factory(d,"IVF256,PQ32X8")
index.train(wb)
index.add(wb)

In [10]:
k = 10
D,I = index.search(wb[2000:2001], k)
I

array([[  2000,  41967, 111101, 104947,  54558,  15054,  64204,  34097,
         41001,  14495]], dtype=int64)

In [16]:
np.in1d(baseline,I)

array([ True,  True, False, False, False, False, False, False,  True,
       False])

In [17]:
%%timeit
D,I = index.search(wb[2000:2001], k)

931 µs ± 73.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:

faiss.write_index(index,'IVF256-PQ32X8.index')

## MULTI-D-ADC Index
#### Multi index , asymmetric distance Computation

. Based on IMI (inverted multi index)



. imi is Similar to ivf but split acrossseveral vector subspaces


#### WITHOUT OPQ

In [19]:
d = 128
index = faiss.index_factory(d,"IMI2x8,PQ32x8")
index.train(wb)
index.add(wb)

In [20]:
k = 10
D,I = index.search(wb[2000:2001], k)
I

array([[  2000, 104947,  34097,  54558, 111101,  14495, 100150,  33297,
         41001,  15054]], dtype=int64)

In [21]:
np.in1d(baseline,I)

array([ True, False, False, False, False, False, False, False,  True,
       False])

In [22]:
%%timeit
D,I = index.search(wb[2000:2001], k)

941 µs ± 32.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
faiss.write_index(index,'IMI2x8,PQ32x8.index')

## With OPQ

In [26]:
d = 128
index_opq = faiss.index_factory(d,"OPQ32,IMI2x8,PQ32x8")
index_opq.train(wb)
index_opq.add(wb)

In [None]:
k = 10
D,I = index.search(wb[2000:2001], k)
I

In [None]:
np.in1d(baseline,I)


In [None]:
imi = faiss.extract_index_ivf(index_opq)
imi.nprobe = 100


In [None]:
k = 10
D,I = index.search(wb[2000:2001], k)
I

In [None]:
np.in1d(baseline,I)

In [None]:
faiss.write_index(index,'OPQ32-IMI2x8-PQ32x8.index')

## IVF HNSW

In [None]:
d = 128
index_ivf_hnsw = faiss.index_factory(d,"IVF4000_HNSW32,Flat")
index_ivf_hnsw.train(wb)
index_ivf_hnsw.add(wb)

In [None]:
k = 10
D,I = index_ivf_hnsw.search(wb[2000:2001], k)
I

In [None]:
np.in1d(baseline,I)


In [None]:
ivf = faiss.extract_index_ivf(index_ivf_hnsw)
ivf.nprobe = 100

In [None]:
np.in1d(baseline,I)

In [None]:
faiss.write_index(index,'IVF4000_HNSW32-Flat.index')