# Faiss and word2vec for fast semantic similarity search

In [None]:
# install dependencies
! pip install faiss-cpu
! pip install gensim

In [None]:
# download word2vec model (2.5GB)
! curl --remote-name-all https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/209{/word2vec_models.zip}
! unzip /work/word2vec_models.zip
! unzip -j /work/word2vec_models.zip '*READ*'

Load the pretrained word2vec vectors.

In [25]:
from gensim.models import KeyedVectors

# Location of the saved KeyedVectors file, relative to the working directory.
model_path = './word2vec-isl/IGC_2021_lemmatized__350__13__9__5__0_05__1_vectors.kv'
vectors = KeyedVectors.load(model_path)

Store all the normalized word vectors in a NumPy array, `xb` (the "x database" that will be indexed).

In [26]:
# Normalized copy of every word vector in the model; this is the database
# that gets added to the faiss index further down.
xb = vectors.get_normed_vectors()
xb.shape  # (number of words, vector dimension)

(969714, 350)

Initialize the query `xq`: the normalized vector of a single word, reshaped into a batch of one row.

In [93]:
import numpy as np

# Normalized embedding of the query word.
query_vector = vectors.get_vector('fingur', norm=True)

# Add a leading batch axis so the query is a 2-D float32 array with one row.
xq = query_vector.astype(np.float32)[np.newaxis, :]

xq.shape

(1, 350)

In [73]:
import faiss

nlist = 128      # number of cells/clusters to partition data into
d = xb.shape[1]  # dimension of vectors, taken from the data (350 for this model)
k = 10           # number of nearest neighbors to return

# The coarse quantizer decides which of the `nlist` cells a vector belongs to.
# It must use the same metric as the IVF index itself, otherwise vectors are
# assigned to cells by one measure and ranked inside the cells by another.
# L2 is used for both, so smaller distances mean closer matches; on normalized
# vectors the L2 ordering is the same as the cosine-similarity ordering.
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)


In [74]:
index.train(xb)  # we must train the index to cluster into cells

# Only the first half of the vocabulary is indexed: the original note records
# 484857 (0.5 * 969714) as the last size that was stable -- presumably a
# memory limit of the environment; TODO confirm and raise if possible.
# Words past this cut-off can never be returned by a search.
n_indexed = len(xb) // 2

# Empty the index first so that re-running this cell does not append the same
# vectors again under new ids, which would no longer line up with the vocabulary.
index.reset()
index.add(xb[:n_indexed])

In [96]:
index.nprobe = 8  # how many nearest cells to search
# Find the k nearest indexed vectors for the query in xq.
# D holds the distances and I the ids of the matches; entry [0][i] of each
# describes the i-th result for the first (only) query row.
D, I = index.search(xq, k)

In [98]:

# display results

# Vectors were added to the index in vocabulary order, so an id returned by
# faiss is the word's position in the vocabulary: look the word up directly
# instead of running another full similarity search to recover it.
vocabulary = vectors.index_to_key

for distance, word_index in zip(D[0], I[0]):  # distance: small means most relevant
    # faiss fills missing results with id -1 when fewer than k neighbors are
    # found; guard explicitly, because -1 would silently pick the last word.
    if not 0 <= word_index < len(vocabulary):
        print("something went wrong")
        continue
    result_word = vocabulary[word_index]
    print(f"result: {result_word} \ndistance: {distance} \n")

result: fingur 
distance: 0.0 

result: vísifingur 
distance: 0.609532356262207 

result: putti 
distance: 0.6576252579689026 

result: þumalfingur 
distance: 0.6648381352424622 

result: þumall 
distance: 0.7613661885261536 

result: langatöng 
distance: 0.7703074216842651 

result: tá 
distance: 0.8249093294143677 

result: handleggur 
distance: 0.8280805945396423 

result: baugfingur 
distance: 0.8759332895278931 

result: litlafingur 
distance: 0.8845452666282654 

