In [92]:
import faiss
from math import sqrt
import numpy as np
import pandas as pd

In [2]:
# Define parameters
dimension = 1536  # Dimension of the vectors
num_centroids = 96  # Number of centroids (clusters)
assert dimension % num_centroids == 0
code_size = 8  # Size of PQ codes
chunk_size = 10000  # Number of vectors to process in each chunk

In [4]:
def load_embeddings_in_chunks(file_path, chunk_size=30000, column="embedding"):
    """
    Load embedding vectors from a CSV file in chunks.
    """
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        embeddings = chunk[column].apply(eval).apply(lambda x: np.array(x, dtype='f'))
        yield np.vstack(embeddings)


def build_ivfpq_index(file_path, chunk_size=30000):
    """
    Build an IVFPQ index from embedding vectors loaded in chunks.
    """
    # Create the quantizer (coarse quantizer)
    quantizer = faiss.IndexFlatL2(dimension)  # The coarse quantizer
    # Create the IVF+PQ index
    index = faiss.IndexIVFPQ(quantizer, dimension, num_centroids, code_size, 8)

    # Train the index with the first chunk
    file_list = load_embeddings_in_chunks(file_path, chunk_size)
    first_chunk = next(file_list)
    print("processing chunk 1...")
    print(len(first_chunk))
    faiss.normalize_L2(first_chunk)
    index.train(first_chunk)
    index.add(first_chunk)
    count = 1
    # Add the rest of the vectors incrementally
    for embeddings in file_list:
        count += 1
        print(f"processing chunk {count}...")
        print(len(embeddings))
        faiss.normalize_L2(embeddings)
        index.add(embeddings)

    return index

In [33]:

index = build_ivfpq_index(file_path, chunk_size=500)
faiss.write_index(index, "Amazon_fine_food_IVFPQ.faiss")

processing chunk 1...
500
processing chunk 2...
499


## example -- Index

In [67]:
file_path = "embedding_data/test_5k.csv"
df = pd.read_csv(file_path)
df["embedding"] = df.embedding.apply(eval).tolist()
df



Unnamed: 0,Summary,Text,combined,embedding
0,Good Quality Dog Food,I have bought several of the Vitality canned d...,Summary: Good Quality Dog Food|Text: I have bo...,"[0.01789022423326969, 0.01164653617888689, -0...."
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Summary: Not as Advertised|Text: Product arriv...,"[-0.013836927711963654, 0.009693932719528675, ..."
2,"""Delight"" says it all",This is a confection that has been around a fe...,"Summary: ""Delight"" says it all|Text: This is a...","[0.015790143981575966, -0.019770486280322075, ..."
3,Cough Medicine,If you are looking for the secret ingredient i...,Summary: Cough Medicine|Text: If you are looki...,"[-0.018270408734679222, -0.010695390403270721,..."
4,Great taffy,Great taffy at a great price. There was a wid...,Summary: Great taffy|Text: Great taffy at a gr...,"[-0.006754104048013687, -0.03700699657201767, ..."
...,...,...,...,...
4994,AMAZING!,If you are on the paleo/caveman/primal/hunter-...,Summary: AMAZING!|Text: If you are on the pale...,"[-0.0012276368215680122, 0.021225370466709137,..."
4995,The cavemen must have been wealthy,I really wanted to like these.<br /><br />Firs...,Summary: The cavemen must have been wealthy|Te...,"[0.0017160774441435933, 0.020770234987139702, ..."
4996,These cookies need work; you can make your own...,I was not impressed with these cookies when I ...,Summary: These cookies need work; you can make...,"[0.009349729865789413, -0.01820063591003418, -..."
4997,Okay in a pinch - not great,The cookies came sealed and seem to be high qu...,Summary: Okay in a pinch - not great|Text: The...,"[-0.026412861421704292, 0.004141702316701412, ..."


In [68]:
np.array(df.embedding[:10].tolist(), dtype="f")

array([[ 0.01789022,  0.01164654, -0.00619002, ...,  0.04405766,
         0.00558473, -0.03294187],
       [-0.01383693,  0.00969393, -0.03435654, ...,  0.00583051,
         0.0412009 , -0.00510296],
       [ 0.01579014, -0.01977049, -0.06444362, ..., -0.00281394,
         0.01088397, -0.00333882],
       ...,
       [ 0.00377534, -0.01761615, -0.0901785 , ...,  0.01977709,
         0.0141283 , -0.00122659],
       [-0.03526118,  0.01750127,  0.03238741, ..., -0.00515842,
        -0.01901   , -0.00598822],
       [ 0.00113104,  0.00350257,  0.01187729, ...,  0.04012235,
        -0.00364851, -0.01442563]], dtype=float32)

In [69]:
## This is the example format of faiss index.add()

xb = np.random.random((2, dimension)).astype('float32')
xb

array([[0.4595993 , 0.45745414, 0.9122513 , ..., 0.79121876, 0.9643332 ,
        0.2556368 ],
       [0.4115985 , 0.2249185 , 0.13262852, ..., 0.02060924, 0.4014559 ,
        0.84031695]], dtype=float32)

In [70]:
dimension = len(df.embedding[0])
index = faiss.IndexFlatL2(dimension)
print(index.ntotal)
index.add(np.array(df.embedding.tolist(), dtype="f"))
print(index.ntotal)

0
4999


In [86]:
np.array(df.embedding[4], dtype='f')

array([-0.0067541 , -0.037007  , -0.07699548, ...,  0.02148498,
        0.01065749, -0.00933675], dtype=float32)

In [62]:
xb[:1]

array([[0.5877511 , 0.31046653, 0.9773711 , ..., 0.30011746, 0.01525147,
        0.4309558 ]], dtype=float32)

In [88]:
# input is a single vector
k = 2
D, I = index.search(np.array([df.embedding[1940]], dtype='f'), k)
print(I)
print(D)

[[1940 1936]]
[[0.         0.66252255]]


In [91]:
# query multiple vectors
k = 3
D, I = index.search(np.array(df.embedding[1000:1005].tolist(), dtype='f'), k)
print(I)
print(D)

[[1000 4472 4470]
 [1001 1003 1286]
 [1002 1004  646]
 [1003 1001 1286]
 [1004 1002 1005]]
[[0.         0.69506353 0.75094485]
 [0.         0.5215305  0.68433094]
 [0.         0.7168575  0.7472522 ]
 [0.         0.5215305  0.67723036]
 [0.         0.7168575  0.73714936]]


## example -- IVF Flat

Build IVF index:

In [107]:
dimension = len(df.embedding[0])
nlist = int(sqrt(df.shape[0])) # number of Voronoi cells to divide. lower this increases accuracy, decreases speed
quantizer = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
assert not index.is_trained

index.train(np.array(df.embedding.tolist(), dtype="f"))
assert index.is_trained

index.add(np.array(df.embedding.tolist(), dtype="f"))
print(index.ntotal)


4999


search IVF index: 

In [108]:
# input is a single vector
k = 2
D, I = index.search(np.array([df.embedding[1940]], dtype='f'), k)
print(I)
print(D)

[[1940 1936]]
[[0.         0.66252255]]


In [109]:
# query multiple vectors
k = 3
D, I = index.search(np.array(df.embedding[1000:1005].tolist(), dtype='f'), k)
print(I)
print(D)

[[1000 3135 3128]
 [1001 1003 1286]
 [1002 1004  646]
 [1003 1001 1286]
 [1004 1002 1005]]
[[0.         0.85312843 0.94472206]
 [0.         0.5215305  0.68433094]
 [0.         0.7168575  0.7472522 ]
 [0.         0.5215305  0.67723036]
 [0.         0.7168575  0.73714936]]


In [111]:
faiss.write_index(index, "IVF_index.bin")

## example - IVFPQ

IVFPQ compress vectors using Product Quantization (PQ) thus memory efficient

In [103]:
dimension = len(df.embedding[0])
nlist = int(sqrt(df.shape[0])) # number of Voronoi cells to divide. lower this increases accuracy, decreases speed
quantizer = faiss.IndexFlatL2(dimension)
nsubvec = 96
assert dimension % nsubvec == 0
nbits = 8

index = faiss.IndexIVFPQ(quantizer, dimension, nlist, nsubvec, nbits)
assert not index.is_trained

index.train(np.array(df.embedding.tolist(), dtype="f"))
assert index.is_trained

index.add(np.array(df.embedding.tolist(), dtype="f"))
print(index.ntotal)

4999


search with IVFPQ index

In [104]:
# input is a single vector
k = 2
D, I = index.search(np.array([df.embedding[1940]], dtype='f'), k)
print(I)
print(D)

[[1940 1936]]
[[0.28868228 0.64126396]]


In [105]:
# query multiple vectors
k = 3
D, I = index.search(np.array(df.embedding[1000:1005].tolist(), dtype='f'), k)
print(I)
print(D)

[[1000 3135 1198]
 [1001 1003 1293]
 [1002 1004 1254]
 [1003 1001 1293]
 [1004 1002 1278]]
[[0.2870647  0.7258632  0.77676415]
 [0.21828422 0.4783349  0.5755052 ]
 [0.22325715 0.5751923  0.6141427 ]
 [0.22571589 0.48206976 0.5817538 ]
 [0.23687881 0.5991674  0.6275588 ]]


In [106]:
faiss.write_index(index, "IVFPQ_index.bin")

## Resvior sampling

If the data too large to fit into RAM, resvior sampling can be used to sample from large files to represent the whole data. This sample will be feed to `index.train` before indexing for clustering. 

In [None]:
def reservoir_sampling(src, nsample, temp_fac=1.5, rs=None): 
    """
    samples nsample vectors from an iterator src that yields matrices
    nsample * temp_fac is the max size of the temporary buffer.
    rs is a RandomState object   
    """
    if rs is None: 
        rs = np.random
    maxsize = int(nsample * temp_fac)
    
    reservoir = []      # represented as a list of subsampled matrices
    nreservoir = 0      # size of the reservoir
    nseen = 0           # number of vectors seen so far 
    threshold = 1.0     # probability for a vector to be included in the reservoir
    
    for mat in src:
        n = len(mat)
        
        if nseen + n < maxsize: 
            # so far, no need to sub-sample
            reservoir.append(mat)
            nreservoir += n
        else: 
            # sample from the input matrix
            mask = rs.rand(n) < threshold
            mat_sampled = mat[mask]
            # add to reservoir
            nreservoir += len(mat_sampled)
            reservoir.append(mat_sampled)
            
            if nreservoir > maxsize: 
                # resamlpe reservoir to nsample
                reservoir = np.vstack(reservoir)
                idx = rs.choice(nreservoir, size=nsample, replace=False)
                reservoir = [reservoir[idx]]
                nreservoir = nsample
                # update threshold
                threshold = nsample / (nseen + n)
            
        nseen += n
    
    # do a last sample
    reservoir = np.vstack(reservoir)
    if nreservoir > nsample: 
        idx = rs.choice(nreservoir, size=nsample, replace=False)
        reservoir = reservoir[idx]
    return reservoir    

In [None]:
def matrix_source(): 
    for i in range(0, 1000, 10): 
        yield np.tile(np.arange(i, i + 10), (5, 1)).T
a = matrix_source()
print(next(a))
print(next(a))
print(next(a))

In [None]:
src = matrix_source()
reservoir_sampling(src, 100)

In [None]:
src = pd.read_csv("embedding_data/test_5k.csv", chunksize = 2000)
sample = reservoir_sampling(src, 100)
pd.DataFrame(sample, columns=['Summary','Text','combined','embedding'])

## Read saved index

In [152]:
import linecache
index = faiss.read_index("IVFPQ_index.bin")

In [172]:
# input is a single vector
k = 18
D, I = index.search(np.array([df.embedding[1111]], dtype='f'), k)
print(I)
print(D)

query_result = []
for idx in I[0]:
    result = linecache.getline("embedding_data/Reviews_embedding.csv", idx)
    query_result.append(result)
print(f"{len(query_result)} related string found, remove {len(query_result) - len(set(query_result))} duplicates.")
set(query_result)

[[  1111    545  26120  56634  67856  78647 113001 228749 331882 403311
  466531 497882 539534 549275 550362 202425  26015  56529  67751  78542
  112896 228644 331777 403206 466426 497777 549170 550257 221445    440
  539429  50687 316183 408759 273606 404116  74429  94437 218735 224468
  243893 279034 304633 305141 333850 348441 410819 422721 541639 559301
   24085  31098 277527 341603 344588 359668 413916 485629 547696  98778
  365785  25401  74440  94448 224479 243904 304644 305152 348452 410830
  422732 541650    233 559312 533560    538  26113  56627  67849  78640
  112994 228742 331875 403304 466524 497875 539527 549268 550355 545074
  279045 218746 333861 340264  78500 115386 397206 334667    546  26121
   56635  67857  78648 113002 228750 331883 403312 466532 497883 539535
  549276 550363 497424 456657 481763 539938   1103 374773 357968 257718
  444264 330056 231116 349713 559168 353042 218747  92745  74441  94449
  224480 243905 279046 304645 305153 333862 348453 410831 422733

{'"A delightful alternative to ""flavoring""","Read the ingredients - the three items used to flavor these are sea salt, tomato paste, and chipotle powder.  I would say the tomato is barely noticeable.  If you like smoky chipotle chili powder - this is a deeply enjoyable alternative to most other snacks on the market today.<br /><br />These are a snack I want to keep available at all times.  It is a crunchy (very, very crunchy to be clear), satisfying, noticeably spicy, nutritious alternative to basically any other snack.  I was surprised by the spice (as in spicy spice).  This definitely is not a flavor that has been tamed - and I am grateful for Mary\'s boldness.","Summary: A delightful alternative to ""flavoring""|Text: Read the ingredients - the three items used to flavor these are sea salt, tomato paste, and chipotle powder.  I would say the tomato is barely noticeable.  If you like smoky chipotle chili powder - this is a deeply enjoyable alternative to most other snacks on the ma

In [151]:
# query multiple vectors
k = 3
D, I = index.search(np.array(df.embedding[1000:1005].tolist(), dtype='f'), k)
print(I)
print(D)

[[  1000 423201  85220]
 [  1001  89772  14033]
 [  1002  66099 143502]
 [  1003 525351  57024]
 [  1004  81980 161510]]
[[0.3559034  0.6082137  0.6727124 ]
 [0.2738871  0.53682315 0.5505231 ]
 [0.27042317 0.50743157 0.50743157]
 [0.26771435 0.49733913 0.5190366 ]
 [0.28873777 0.5517103  0.6005175 ]]
