In [1]:
# Importing dependecies
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


#### Vector Search Setup

In [2]:
# expand the view so we can read the full text easily
pd.set_option('display.max_colwidth', 100)

# load your dataset and check the dimensions to ensure everything imported correctly
df = pd.read_csv("sample_text.csv")
print(f"Dataset loaded with {df.shape[0]} rows.")

Dataset loaded with 8 rows.


#### Convert Text into Numerical Vectors

In [3]:
# load a pre-trained model that understands general English meaning
encoder = SentenceTransformer("all-mpnet-base-v2")

# transform the entire 'text' column into high-dimensional vectors
vectors = encoder.encode(df.text)

# capture the 'dimension' (the length of the vector) which is 768 for this model
dim = vectors.shape[1]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-mpnet-base-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  38%|###8      | 168M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

#### Build and Populate the Search Index

In [4]:
# initialize a flat index which uses L2 (Euclidean) distance to measure similarity
index = faiss.IndexFlatL2(dim)

# add our encoded text vectors into the searchable index
index.add(vectors)

In [6]:
# check how many items are actually in the index
print(f"Total vectors in index: {index.ntotal}")

# check if the index is "trained" (IndexFlatL2 is always trained by default)
print(f"Is index trained: {index.is_trained}")

# test a search for the top 2 closest matches to your query
distances, indices = index.search(svec, k=2)

# display the results clearly
print("\n--- Search Results ---")
print(f"Nearest neighbor indices: {indices[0]}")
print(f"Distances (L2):           {distances[0]}")

Total vectors in index: 8
Is index trained: True

--- Search Results ---
Nearest neighbor indices: [3 2]
Distances (L2):           [1.3844836 1.4039094]


#### Prepare the Search Query

In [5]:
# define what we are looking for
search_query = "I want to buy a polo t-shirt"

# encode the query and reshape it so FAISS recognizes it as a single search entry
vec = encoder.encode(search_query)
svec = np.array(vec).reshape(1, -1)

In [7]:
# check that the vector has 768 dimensions (the model's standard size)
print(f"Query Vector Shape: {svec.shape}")

# peek at the first 5 numbers of the vector to see the "math" behind your text
print(f"First 5 vector values: {svec[0][:5]}")

Query Vector Shape: (1, 768)
First 5 vector values: [ 0.01038829  0.02786865 -0.01186188  0.01813272  0.00121983]


#### Search for similar vector in the FAISS index created

In [8]:
# Use the index to find the 2 closest matches to our search query
distances, indices = index.search(svec, k=2)

# Convert the results into a simple list of row numbers
row_indices = indices.tolist()[0]

# Display the actual rows from the dataframe that match those numbers
print(f"Results for: '{search_query}'")
df.loc[row_indices]

Results for: 'I want to buy a polo t-shirt'


Unnamed: 0,text,category
3,Vibrant color jeans for male are becoming a trend,Fashion
2,These are the latest fashion trends for this week,Fashion
