In [4]:
## FAISS : Facebook AI Similarity Search 
## Semantic search and not keyword search
## Use case : To search embeddings of multimedia docs similar to each other

## Change log :
## Chunks to Embeddings conversion 
## Storing in vector db (here viz.)-> FAISS Index

!pip install faiss-cpu
!pip install sentence_transformers



In [5]:
import pandas as pd
import numpy as np
import faiss

In [51]:
## Show up to 100 characters per column when a DataFrame is displayed
pd.options.display.max_colwidth = 100

In [52]:
## Load the source data from 'text.csv' (relative path) into a DataFrame
df = pd.read_csv('text.csv')

In [53]:
## Check how many rows and columns were loaded
df.shape

(21, 2)

In [54]:
## Preview the first rows of the loaded frame
df.head()

Unnamed: 0,text,category
0,Adventure travel destinations for thrill-seekers looking for a challenge,Travel
1,The benefits of mindfulness meditation on stress reduction and focus,Health
2,Sustainable and cruelty-free beauty products gaining popularity,Fashion
3,Exclusive film screening at the downtown cinema this Friday,Event
4,International food festival featuring diverse cuisines in the city,Event


In [55]:
## Create source embeddings for the text column
from sentence_transformers import SentenceTransformer

In [56]:
## Load the sentence-embedding model used to convert text into vectors
encoder = SentenceTransformer("all-mpnet-base-v2")

## Encode the text column. A plain list of strings is passed (instead of the
## pandas Series) so the encoder never depends on the DataFrame's index labels,
## e.g. after rows have been filtered or the index has been changed.
vector_result = encoder.encode(df["text"].tolist())
vector_result.shape

(21, 768)

In [57]:
## Inspect the embedding matrix returned by the encoder
vector_result

array([[ 1.15750888e-02, -1.80600379e-02, -2.18630992e-02, ...,
        -4.16774310e-05,  3.78705114e-02,  6.16514403e-03],
       [-2.10229512e-02, -1.74044780e-02, -3.11093777e-02, ...,
        -5.82218394e-02, -1.83688216e-02, -2.09694225e-02],
       [ 6.44548982e-02,  8.36133659e-02, -2.35022455e-02, ...,
        -1.82379428e-02, -5.66119514e-02,  1.69566565e-03],
       ...,
       [ 3.85678038e-02,  1.22293029e-02, -2.70051006e-02, ...,
         2.79318970e-02, -1.19512025e-02,  6.36785524e-03],
       [ 1.24867223e-02,  3.78133804e-02, -2.55502500e-02, ...,
        -1.09511921e-02, -4.79510613e-03, -2.69153956e-02],
       [ 1.15750888e-02, -1.80600379e-02, -2.18630992e-02, ...,
        -4.16774310e-05,  3.78705114e-02,  6.16514403e-03]], dtype=float32)

In [58]:
## Size (number of dimensions) of each embedding vector, i.e. the second axis
## of vector_result; used below to construct the FAISS index
size = vector_result.shape[1]
size

768

In [59]:
## Create the FAISS index used for vector storage.
## IndexFlatL2 is constructed with the vector size computed above.

index_db = faiss.IndexFlatL2(size)

In [60]:
## Normalize the source vectors and add them to the index.
## With unit-length vectors, ranking by L2 distance gives the same order as
## ranking by cosine similarity.

## faiss.normalize_L2 works in place, so normalize a float32 copy and leave
## vector_result untouched for the other cells.
vectors_normalized = np.array(vector_result, dtype=np.float32, order="C")
faiss.normalize_L2(vectors_normalized)

## Empty the index first so that re-running this cell does not store every
## vector a second time (which would return duplicate neighbours).
index_db.reset()
index_db.add(vectors_normalized)
index_db

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000026A430FB510> >

In [70]:
## Encode the search text using the same encoder and normalize the output vector

query = "I like attending concerts."
## Alternative queries to try:
## query = "Do you like eating vegetables?"
## query = "Wow , these new shoes!"

## normalize_embeddings=True makes the encoder return a unit-length vector,
## which is the normalization step described above.
vector = encoder.encode(query, normalize_embeddings=True)
vector.shape

(768,)

In [71]:
## Inspect the 1-D embedding of the query text
vector

array([-8.16788748e-02,  7.82953650e-02, -2.43047215e-02,  3.85544673e-02,
        6.80918479e-03,  1.08801266e-02, -6.76667243e-02, -1.83000192e-02,
        1.14926146e-02,  7.59412944e-02, -8.45118538e-02,  3.53309587e-02,
       -1.68744493e-02,  2.85406057e-02,  4.17841859e-02, -6.03594892e-02,
        1.89049542e-02, -4.22759503e-02, -1.11416215e-02,  3.02081481e-02,
       -5.56001924e-02,  1.24355238e-02, -4.11903635e-02, -1.75981950e-02,
        6.35768333e-03, -2.17955448e-02, -2.32358160e-03, -4.14825492e-02,
        4.18478325e-02,  9.60389376e-02, -4.84037511e-02, -6.48705736e-02,
        3.51038501e-02, -1.53316138e-02,  1.15147384e-06,  4.02133167e-02,
        3.24844476e-03, -3.35100405e-02,  7.76793361e-02, -2.78055761e-02,
        3.35152075e-03,  5.42000197e-02, -2.90063862e-02,  2.88961995e-02,
        3.06382403e-03, -1.60840657e-02, -6.74625253e-03, -1.63166262e-02,
       -4.98896502e-02,  1.52337244e-02, -1.79854799e-02, -1.51053034e-02,
       -3.87503840e-02, -

In [72]:
## Turn the 1-D query embedding into a single-row 2-D matrix,
## the form passed to index_db.search in the search cell
m_vec = np.reshape(np.array(vector), (1, -1))
m_vec.shape

(1, 768)

In [73]:
## Inspect the reshaped (single-row) query matrix
m_vec

array([[-8.16788748e-02,  7.82953650e-02, -2.43047215e-02,
         3.85544673e-02,  6.80918479e-03,  1.08801266e-02,
        -6.76667243e-02, -1.83000192e-02,  1.14926146e-02,
         7.59412944e-02, -8.45118538e-02,  3.53309587e-02,
        -1.68744493e-02,  2.85406057e-02,  4.17841859e-02,
        -6.03594892e-02,  1.89049542e-02, -4.22759503e-02,
        -1.11416215e-02,  3.02081481e-02, -5.56001924e-02,
         1.24355238e-02, -4.11903635e-02, -1.75981950e-02,
         6.35768333e-03, -2.17955448e-02, -2.32358160e-03,
        -4.14825492e-02,  4.18478325e-02,  9.60389376e-02,
        -4.84037511e-02, -6.48705736e-02,  3.51038501e-02,
        -1.53316138e-02,  1.15147384e-06,  4.02133167e-02,
         3.24844476e-03, -3.35100405e-02,  7.76793361e-02,
        -2.78055761e-02,  3.35152075e-03,  5.42000197e-02,
        -2.90063862e-02,  2.88961995e-02,  3.06382403e-03,
        -1.60840657e-02, -6.74625253e-03, -1.63166262e-02,
        -4.98896502e-02,  1.52337244e-02, -1.79854799e-0

In [74]:
## Search the FAISS index created above for the vectors most similar to the query
## (k=4 asks for four results)

dist,idx = index_db.search(m_vec, k=4)
dist,idx
## search returns a tuple : (distances, indices); the indices are used below
## to look up the matching rows in df

(array([[1.4658836, 1.4906479, 1.4986327, 1.512398 ]], dtype=float32),
 array([[15,  4,  9, 16]], dtype=int64))

In [75]:
## Convert the index array to a nested Python list (one inner list per query row)
idx.tolist()

[[15, 4, 9, 16]]

In [76]:
## Result ids for the first (and only) query row, as a flat list
idx_rows = idx[0].tolist()
idx_rows

[15, 4, 9, 16]

In [77]:
## FAISS ids are the 0-based positions in which the vectors were added, so the
## rows are selected by position (iloc) rather than by index label (loc).
## FAISS fills missing results with -1 when fewer than k neighbours exist;
## those are dropped so they are not mistaken for "the last row".
df.iloc[[row for row in idx_rows if row >= 0]]

Unnamed: 0,text,category
15,Jazz night featuring local musicians at the waterfront venue,Event
4,International food festival featuring diverse cuisines in the city,Event
9,Cultural festival celebrating diversity and traditions in the community,Event
16,Weekend getaway ideas for a quick escape from the hustle and bustle,Travel


In [78]:
## Echo the query text so it can be compared with the retrieved rows
query

'I like attending concerts.'