In [None]:
# Third-party packages imported later in this notebook (faiss, sentence_transformers).
# Uncomment the lines below to install them if they are not already available.
# ! pip install faiss-cpu
# ! pip install sentence-transformers

In [1]:
import pandas as pd

# Toy corpus: each row is [example user query, category label].
data = [
    ["What is the weather like today?", "general"],
    ["Can you provide the latest stock market updates?", "finance"],
    ["Recommend a good Italian restaurant nearby", "food"],
    ["How do I reset my password?", "tech support"],
    ["Tell me a joke", "entertainment"],
    ["What are the symptoms of a flu?", "health"],
    ["Book a flight to New York", "travel"],
    ["How to make a chocolate cake?", "cooking"],
    ["Whats the score in the football game?", "sports"],
    ["Im feeling happy today", "personal emotion"],
]

# Build the frame column by column; row order (and therefore the default
# 0..n-1 index) follows the order of `data`.
df = pd.DataFrame(
    {
        "text": [row[0] for row in data],
        "category": [row[1] for row in data],
    }
)

In [2]:
# Preview the first rows of the query/category table.
df.head()

Unnamed: 0,text,category
0,What is the weather like today?,general
1,Can you provide the latest stock market updates?,finance
2,Recommend a good Italian restaurant nearby,food
3,How do I reset my password?,tech support
4,Tell me a joke,entertainment


In [3]:
from sentence_transformers import SentenceTransformer

# Sentences to embed; `text` stays a pandas Series (the 'text' column of df).
text = df['text']

# Pre-trained sentence-embedding model.
# Alternative model that can be swapped in: "bert-base-nli-mean-tokens".
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")

# Hand the encoder a plain list of strings rather than the Series itself.
# encode() is documented for a string or a list of strings and looks items up
# by position; on a Series that lookup is label-based, so it would only work
# while df keeps its default 0..n-1 index (e.g. it could fail after filtering).
embeddings = encoder.encode(text.tolist())

In [4]:
# Show the embedding matrix returned by the encoder (NumPy abbreviates large arrays when printing).
embeddings

array([[-0.24353878, -0.33994555, -0.03588928, ...,  0.01973329,
        -0.1958899 ,  0.18529022],
       [-0.27030158,  0.1076827 , -0.18158962, ...,  0.08727878,
        -0.19526926, -0.09498555],
       [-0.21940392,  0.14821872, -0.00985209, ...,  0.02815687,
         0.12007379,  0.00619158],
       ...,
       [ 0.0703835 , -0.40063456, -0.03175796, ...,  0.08103655,
         0.0760937 ,  0.1977389 ],
       [-0.11239459, -0.15254383, -0.1391996 , ...,  0.19001462,
         0.25385964, -0.2091582 ],
       [-0.1684444 , -0.2986751 , -0.02637996, ...,  0.08791838,
         0.17442603, -0.10776687]], dtype=float32)

In [5]:
# (rows, columns) of the embedding matrix: one row per sentence in df.
embeddings.shape

(10, 768)

In [6]:
import faiss
# Scale every embedding row to unit length. faiss.normalize_L2 works in place,
# so `embeddings` itself holds the normalised vectors from this point on.
faiss.normalize_L2(embeddings)

# Width of one embedding vector; the index must be created with this size.
_, vector_dimensions = embeddings.shape

# Flat L2 index over the normalised vectors. Rows are added in the same order
# as the sentences in df, so index positions line up with df's row positions.
index = faiss.IndexFlatL2(vector_dimensions)
index.add(embeddings)

In [10]:
import numpy as np

# Query to look up in the index.
# NOTE(review): this literal looks mis-encoded; the recorded results (closest
# match "Tell me a joke") suggest a different original query. Confirm the intended text.
search_text = '≈y'

# Embed the query with the same encoder that produced the indexed vectors.
search_vector = encoder.encode(search_text)

# Promote the single vector to a one-row batch. The copy keeps
# `search_vector` untouched by the in-place normalisation below.
new_vector = search_vector[np.newaxis, :].copy()
faiss.normalize_L2(new_vector)

In [11]:
# Ask the index for the 4 closest stored vectors to the query.
# Both results have one row per query vector; there is a single query here.
distances, ann = index.search(new_vector, k=4)

# Pair each returned position (`ann`) with its distance for the one query row.
results = pd.DataFrame(dict(distances=distances[0], ann=ann[0]))

# Attach the original text/category by matching `ann` against df's index.
df_merged = results.merge(df, left_on='ann', right_index=True)

In [12]:
# Display the matched rows together with their distances to the query.
df_merged.head()

Unnamed: 0,distances,ann,text,category
0,0.781403,4,Tell me a joke,entertainment
1,1.528978,9,Im feeling happy today,personal emotion
2,1.593853,0,What is the weather like today?,general
3,1.624735,8,Whats the score in the football game?,sports
