<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/nlp-for-vector-similarity-search/02_nearest_neighbor_indexes_for_similarity_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Nearest Neighbor Indexes for Similarity Search

**Reference**

[Nearest Neighbor Indexes for Similarity Search](https://www.pinecone.io/learn/vector-indexes/)

## Setup

In [None]:
!pip install faiss-cpu
!pip install -U sentence-transformers

In [2]:
import shutil
import urllib.request as request
from contextlib import closing
from io import StringIO
import pandas as pd
import numpy as np
import tarfile

from sentence_transformers import SentenceTransformer

import faiss

## Dataset

In [3]:
# fetch the Sift1M archive over FTP and stream it to a local file;
# both the remote response and the local file are closed when the block exits
with closing(request.urlopen("ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz")) as response, \
     open("sift.tar.gz", "wb") as archive:
  shutil.copyfileobj(response, archive)

In [4]:
# the download leaves us with a tar.gz file; unpack it into the current
# working directory. Opening the archive in a `with` block guarantees the
# file handle is closed, even if extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive,
# so only run this on an archive from a trusted source.
with tarfile.open("sift.tar.gz", "r:gz") as tar:
  tar.extractall()

In [5]:
# reader for the .fvecs files that ship with the Sift1M dataset
def read_fvecs(fp):
  """Load an .fvecs file into a 2-D float32 array, one vector per row.

  The file is read as a flat stream of 32-bit words. The first word is taken
  as the vector dimensionality ``d``; the stream is then cut into records of
  ``d + 1`` words, the leading word of every record is dropped, and the
  remaining words are reinterpreted (not numerically converted) as float32.

  Args:
    fp: File path (or open file object) accepted by ``np.fromfile``.

  Returns:
    A float32 array of shape ``(n_vectors, d)``.
  """
  words = np.fromfile(fp, dtype="int32")
  dim = words[0]
  records = words.reshape(-1, dim + 1)
  # strip the per-record header word; the copy makes the slice contiguous so
  # its int32 bit patterns can be viewed as float32
  payload = records[:, 1:].copy()
  return payload.view("float32")

In [6]:
# base set: the vectors every index below will search through (1M samples)
xb = read_fvecs("./sift/sift_base.fvecs")
# query set: sift_query.fvecs holds many query vectors
xq = read_fvecs('./sift/sift_query.fvecs')
# keep only the first query, as a 2-D array with a single row so it can be
# passed to index.search as a batch of one
xq = xq[0][np.newaxis, :]

In [7]:
# sanity check: the query should now be a single row
xq.shape

(1, 128)

In [8]:
# size of the base set: (number of vectors, vector dimensionality)
xb.shape

(1000000, 128)

In [9]:
# inspect the raw values of the query vector
xq

array([[  1.,   3.,  11., 110.,  62.,  22.,   4.,   0.,  43.,  21.,  22.,
         18.,   6.,  28.,  64.,   9.,  11.,   1.,   0.,   0.,   1.,  40.,
        101.,  21.,  20.,   2.,   4.,   2.,   2.,   9.,  18.,  35.,   1.,
          1.,   7.,  25., 108., 116.,  63.,   2.,   0.,   0.,  11.,  74.,
         40., 101., 116.,   3.,  33.,   1.,   1.,  11.,  14.,  18., 116.,
        116.,  68.,  12.,   5.,   4.,   2.,   2.,   9., 102.,  17.,   3.,
         10.,  18.,   8.,  15.,  67.,  63.,  15.,   0.,  14., 116.,  80.,
          0.,   2.,  22.,  96.,  37.,  28.,  88.,  43.,   1.,   4.,  18.,
        116.,  51.,   5.,  11.,  32.,  14.,   8.,  23.,  44.,  17.,  12.,
          9.,   0.,   0.,  19.,  37.,  85.,  18.,  16., 104.,  22.,   6.,
          2.,  26.,  12.,  58.,  67.,  82.,  25.,  12.,   2.,   2.,  25.,
         18.,   8.,   2.,  19.,  42.,  48.,  11.]], dtype=float32)

## Flat Index

In [10]:
# dimensionality of Sift1M data (second axis of xb and xq shown above)
d = 128
# number of nearest neighbors to return from each search
k = 10

In [11]:
# initialize IndexFlatL2 index: a flat index that compares the query against
# every stored vector using L2 (Euclidean) distance. L2 is used rather than
# inner product (IndexFlatIP) because the Sift1M vectors are not normalized
# and the dataset is benchmarked with Euclidean distance.
index = faiss.IndexFlatL2(d)
# flat indexes need no training; vectors are stored as-is
index.add(xb)

In [12]:
%%time

# search: query the index for the k nearest neighbors of xq.
# faiss returns a pair: D holds the distances/scores, I the row ids into xb.
D, I = index.search(xq, k)
print(I)

[[932085 934876 561813 708177 706771 695756 435345 701258 872728 455537]]
CPU times: user 56.7 ms, sys: 0 ns, total: 56.7 ms
Wall time: 59.3 ms


In [None]:
# NOTE(review): leftover cell. `sentences` is not defined in any cell shown
# above, and the output stored under this cell lists sentence text rather
# than Sift1M ids, so it appears to come from a different notebook.
# Kept commented out; confirm whether it can be removed.
# [f'{i}: {sentences[i]}' for i in I[0]]

['3853: A group of football players is running in the field',
 '3412: A group of people playing football is running in the field',
 '10113: Two groups of people are playing football',
 '4266: A person playing football is running past an official carrying a football']

## Locality Sensitive Hashing

In [13]:
# resolution of bucketed vectors: number of bits passed to IndexLSH,
# here four bits per input dimension
nbits = d * 4

# initialize index and add vectors
# NOTE: this rebinds `index`, so the flat index built earlier is no longer
# reachable under that name
index = faiss.IndexLSH(d, nbits)
index.add(xb)

# search with the same query vector and k as the flat index;
# D and I are overwritten with the LSH results
D, I = index.search(xq, k)

In [14]:
# ids returned by the LSH index for the query
print(I)

[[435345 931632 708177 813701 934876 455537 932085 561813 248185 361496]]


## HNSW Implementation