<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/nlp-for-vector-similarity-search/01_introduction_to_faiss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction to Faiss

**Reference**

[Introduction to Facebook AI Similarity Search (Faiss)](https://www.pinecone.io/learn/faiss-tutorial/)

## Setup

In [None]:
!pip install faiss-cpu
!pip install -U sentence-transformers

In [None]:
import requests
from io import StringIO
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer

import faiss

## Building Vectors

In [None]:
# Download the SICK 2014 training split: tab-separated text with a header row.
response = requests.get(
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt',
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# Fail loudly on a 4xx/5xx instead of parsing an error page as if it were data.
response.raise_for_status()

# create dataframe
data = pd.read_csv(StringIO(response.text), sep='\t')
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [None]:
# start the corpus with every sentence_A entry (sentence_B is appended in the next cell)
sentences = list(data["sentence_A"])
sentences[:5]

['A group of kids is playing in a yard and an old man is standing in the background',
 'A group of children is playing in the house and there is no man standing in the background',
 'The young boys are playing outdoors and the man is smiling nearby',
 'The kids are playing outdoors near a man with a smile',
 'The young boys are playing outdoors and the man is smiling nearby']

In [None]:
# append every sentence_B entry to the same corpus
sentence_b = list(data["sentence_B"])
sentences += sentence_b

# number of distinct entries collected so far
len({*sentences})

4802

In [None]:
#  let's pull in a few more similar datasets
urls = [
  'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
  'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
  'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
  'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
  'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
  'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
  'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

# each of these datasets has the same headerless, tab-separated layout, so we loop through them
for url in urls:
  response = requests.get(url, timeout=30)
  # stop on a 4xx/5xx rather than parsing an error page as sentences
  response.raise_for_status()

  # create dataframe; malformed rows are skipped with a warning.
  # on_bad_lines replaces error_bad_lines, which pandas deprecated in 1.3 and removed in 2.0
  data = pd.read_csv(StringIO(response.text), sep='\t', header=None, on_bad_lines='warn')
  # add columns 1 and 2 to the sentences list
  sentences.extend(data[1].tolist())
  sentences.extend(data[2].tolist())

In [None]:
# distinct entries gathered from all datasets (non-string values are filtered out in the next cell)
len({*sentences})

14505

## Building Dense Vectors

In [None]:
# remove duplicates and NaN
# dict.fromkeys keeps first-seen order, so each sentence's position -- and therefore the
# id it receives when its embedding is added to an index -- is the same on every run.
# list(set(...)) order changes between kernels because str hashing is randomized.
sentences = [sentence for sentence in dict.fromkeys(sentences) if isinstance(sentence, str)]

In [None]:
# load the pretrained sentence encoder used for every embedding below
model_name = "bert-base-nli-mean-tokens"
model = SentenceTransformer(model_name)

In [None]:
# embed the whole corpus, then report the (rows, columns) shape of the result
sentence_embeddings = model.encode(sentences)
np.shape(sentence_embeddings)

(14504, 768)

## Euclidean (L2) distance

In [None]:
# width of one embedding, i.e. the dimensionality every index below is built with
d = sentence_embeddings.shape[-1]
d

768

In [None]:
# initialize an IndexFlatL2 index for d-dimensional vectors
index = faiss.IndexFlatL2(d)
# print the is_trained flag right after construction, before any vectors are added
print(index.is_trained)

True


## Query Sentence

In [None]:
# load our embeddings into the index
# NOTE(review): add() presumably appends, so re-running this cell without rebuilding
# `index` would store the same vectors again -- confirm before re-running
index.add(sentence_embeddings)
# number of vectors the index now reports holding
index.ntotal

14504

In [None]:
# number of nearest neighbours to return for the query
k = 4

# encode the query text as a one-element batch; xq is what gets passed to index.search
query_text = "Someone sprints with a football"
xq = model.encode([query_text])

In [None]:
%%time

# search the index for the k entries closest to xq.
# I holds the matching row ids (used below to look up `sentences`);
# D is the companion array returned by search (presumably the distances -- see the FAISS docs)
D, I = index.search(xq, k)
print(I)

[[ 3853  3412 10113  4266]]
CPU times: user 7.81 ms, sys: 22 µs, total: 7.83 ms
Wall time: 9.2 ms


In [None]:
# `data` is whatever frame the download loop assigned last (the final URL in `urls`);
# this shows how many entries its column 1 has
len(data[1])

1500

In [None]:
# pair each returned id with the sentence stored at that position
[f'{row_id}: {sentences[row_id]}' for row_id in I[0]]

['3853: A group of football players is running in the field',
 '3412: A group of people playing football is running in the field',
 '10113: Two groups of people are playing football',
 '4266: A person playing football is running past an official carrying a football']

## Vector Reconstruction

In [None]:
# ids returned for the single query, as plain Python ints
neighbour_ids = I[0].tolist()

# rebuild the stored vector for each id and stack them row by row into one float64 array
vecs = np.array([index.reconstruct(row_id) for row_id in neighbour_ids], dtype=np.float64)

vecs.shape

(4, 768)

In [None]:
# first 100 components of the first reconstructed vector
vecs[0, :100]

array([ 0.01627023,  0.22325909, -0.15037383, -0.30747274, -0.27122426,
       -0.105932  , -0.06460915,  0.0473821 , -0.73349071, -0.37657702,
       -0.76762801,  0.16902868,  0.53107685,  0.51176631,  1.14415824,
       -0.08562893, -0.67240089, -0.96637088,  0.02545471, -0.2155983 ,
       -1.25656641, -0.82982188, -0.09824977, -0.21850872,  0.5061025 ,
        0.10527933,  0.50396907,  0.65242976, -1.39458668,  0.65847468,
       -0.21525355, -0.22487433,  0.81818348,  0.08464285, -0.76141697,
       -0.28928307, -0.0982579 , -0.73046142,  0.07855832, -0.84354609,
       -0.59242088,  0.7747137 , -1.20920563, -0.22757971, -1.30733621,
       -0.23081483, -1.31322527,  0.01629104, -0.97285467,  0.19308169,
        0.47424543,  1.1892091 , -1.96741259, -0.70061135, -0.29638749,
        0.6053372 ,  0.6240744 , -0.70340365, -0.86754185,  0.17673104,
       -0.19170581, -0.02951968,  0.22623521, -0.16695444, -0.80402541,
       -0.45918944,  0.69675523, -0.249282  , -1.01478708, -0.92

## Partitioning The Index

In [None]:
# how many cells (partitions) the index is split into
n_list = 50
# flat L2 index handed to the IVF index as its quantizer
quantizer = faiss.IndexFlatL2(d)
# replaces the earlier flat `index` with a partitioned one built from quantizer, d and n_list
index = faiss.IndexIVFFlat(quantizer, d, n_list)

In [None]:
# the index has to be trained on data before any data is added to it;
# check the flag first (the saved output shows False at this point)
index.is_trained

False

In [None]:
# train the partitioned index on the full set of sentence embeddings
index.train(sentence_embeddings)
# check if index is now trained
index.is_trained

True

In [None]:
# add every sentence embedding to the trained index
# NOTE(review): add() presumably appends, so re-running this cell alone would duplicate the vectors
index.add(sentence_embeddings)
# number of embeddings indexed
index.ntotal

14504

In [None]:
# Let’s search again using the same indexed sentence embeddings and the same query vector
%%time

# search
D, I = index.search(xq, k)
print(I)

[[ 3853  3412 10113  4266]]
CPU times: user 589 µs, sys: 16 µs, total: 605 µs
Wall time: 833 µs


In [None]:
# look up the sentence behind each id returned by the partitioned index
[f'{row_id}: {sentences[row_id]}' for row_id in I[0]]

['3853: A group of football players is running in the field',
 '3412: A group of people playing football is running in the field',
 '10113: Two groups of people are playing football',
 '4266: A person playing football is running past an official carrying a football']

In [None]:
# We can increase the number of nearby cells to search with nprobe:
# from here on each query scans 10 of the n_list cells
index.nprobe = 10

In [None]:
%%time

# repeat the same search now that nprobe has been raised, timing it for comparison
D, I = index.search(xq, k)
print(I)

[[ 3853  3412 10113  4266]]
CPU times: user 1.08 ms, sys: 3 µs, total: 1.08 ms
Wall time: 1.09 ms


In [None]:
# look up the sentence behind each id returned with the larger nprobe
[f'{row_id}: {sentences[row_id]}' for row_id in I[0]]

['3853: A group of football players is running in the field',
 '3412: A group of people playing football is running in the field',
 '10113: Two groups of people are playing football',
 '4266: A person playing football is running past an official carrying a football']

## Vector Reconstruction

In [None]:
# now, to reconstruct vectors from the partitioned index, we must first create the
# direct mappings that reconstruct() relies on
index.make_direct_map()

In [None]:
# rebuild the stored vector with id 11122 and show its first 100 components
# NOTE(review): 11122 looks like an arbitrary example id -- it is not taken from a search result
index.reconstruct(11122)[:100]

array([ 1.2838421 ,  0.21651636,  0.9712451 ,  0.31156752,  0.28217337,
        0.91760635,  1.5379874 ,  0.18022525, -0.87787133, -0.02692633,
       -1.3939755 , -0.02931822, -0.0910926 ,  0.8252405 ,  1.1967825 ,
        0.46976212, -0.4116232 , -0.44104576,  0.00642029,  0.14851105,
       -0.43181732,  0.53935724,  0.6806572 , -1.2024508 , -0.5175477 ,
       -0.87696105, -0.14817108, -0.24578056, -1.1521579 ,  0.2925653 ,
        0.1566306 ,  0.9666366 ,  0.48437908, -0.14077239,  0.6266316 ,
       -0.28244448, -0.12196147, -1.0980004 ,  0.39501837, -0.4538445 ,
        0.03117619,  0.350108  ,  0.088641  , -0.04181439,  0.49622074,
       -0.02361662,  2.3334348 , -0.49966168, -0.1949345 , -0.37520537,
        0.02379034, -0.2796866 ,  1.5051224 ,  0.75192493, -0.23107927,
       -0.4343509 ,  0.41142032, -0.58381236,  0.6106839 ,  0.5567988 ,
        0.39692488, -0.09096396,  0.01875396,  0.06579515, -0.54263747,
       -0.29689386, -0.2615427 , -0.36894163,  0.02824443, -0.37

## Quantization

In [None]:
# number of centroid IDs in final compressed vectors
# NOTE(review): product quantization presumably needs d to be divisible by m
# (768 / 8 = 96 here) -- confirm against the FAISS docs before changing either value
m = 8

# number of bits in each centroid
bits = 8

# we keep the same L2 distance flat index as the quantizer, and reuse n_list from the
# partitioning section; this replaces `index` with a new, still empty IVFPQ index
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, n_list, m, bits)

In [None]:
# check whether the new index has been trained yet
# NOTE(review): the saved output shows True although train() is only called in the next
# cell -- presumably this cell was last run after training; expect False on a fresh run
index.is_trained

True

In [None]:
# train the IVFPQ index on the full set of sentence embeddings before adding anything to it
index.train(sentence_embeddings)

In [None]:
# add() appends, so running this cell twice stores every vector twice and lets searches
# return ids that have no entry in `sentences`; only add while the index is still empty
if index.ntotal == 0:
  index.add(sentence_embeddings)

# every embedding must be stored exactly once for ids to line up with `sentences`
assert index.ntotal == len(sentence_embeddings), index.ntotal

In [None]:
# now we’re ready to begin searching using our new index;
# as with the IVFFlat index above, scan 10 cells per query
index.nprobe = 10

In [None]:
%%time

# same query and k as before, now against the IVFPQ index
D, I = index.search(xq, k)
print(I)

[[ 3853 18357  3412 17916]]
CPU times: user 3.63 ms, sys: 46 µs, total: 3.68 ms
Wall time: 2.66 ms


In [None]:
# show the hits actually returned by the search instead of hand-copied ids, which go
# stale on every re-run; skip FAISS's -1 "no result" marker and any id that has no
# entry in `sentences`
[f'{i}: {sentences[i]}' for i in I[0] if 0 <= i < len(sentences)]

['3853: A group of football players is running in the field',
 '3412: A group of people playing football is running in the field']