### Loading dataset

In [None]:
# ! pip install -U datasets

In [None]:
from datasets import load_dataset

# Fetch the English Wikipedia snapshot identified by the "20231101.en" config
# of the "wikimedia/wikipedia" dataset repository.
dataset = load_dataset(path="wikimedia/wikipedia", name="20231101.en")

In [None]:
# Pick the "train" split out of the loaded dataset and show how many rows it has.
train_dataset = dataset["train"]
len(train_dataset)

In [None]:
import random

# Fixed seed so the same articles are drawn on every Restart & Run All;
# without it, positional lookups into the sampled articles further down
# would point at different articles on each run.
SAMPLE_SEED = 42
SAMPLE_SIZE = 1000

num_rows = len(train_dataset)
# A private Random instance leaves the global `random` state untouched.
rng = random.Random(SAMPLE_SEED)
# min() guards against a dataset smaller than the requested sample size
# (random.sample raises ValueError when asked for more items than exist).
random_indices = rng.sample(range(num_rows), min(SAMPLE_SIZE, num_rows))
random_rows = [train_dataset[idx] for idx in random_indices]

In [None]:
# Keep only the "text" field of every sampled row.
articles = [row["text"] for row in random_rows]

### Generating embeddings using Sentence Transformers

In [None]:
# ! pip install -U transformers nltk sentence-transformers faiss-cpu tqdm

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-embedding model to load.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [None]:
from tqdm import tqdm

# Encode all articles in one batched call instead of one forward pass per
# article; the library batches the texts internally and shows its own
# progress bar. By default encode() returns a 2-D NumPy array with one row
# per input text, so len(result) and result[0] keep working as before.
result = model.encode(articles, batch_size=32, show_progress_bar=True)


In [None]:
# Number of embeddings produced (one per encoded article).
len(result)

In [None]:
# Shape of the first article's embedding.
result[0].shape

### Indexing data into FAISS

In [None]:
import numpy as np

# Collect the embeddings into a single array. FAISS operates on float32
# data, so the dtype is pinned explicitly rather than relying on whatever
# dtype the embedding model happens to return.
np_result = np.asarray(result, dtype=np.float32)

In [None]:
# Shape of the embedding array that will be added to the index.
np_result.shape

In [None]:
import faiss

# Take the vector dimensionality from the data instead of hard-coding 768,
# so the index still matches if a different embedding model is used.
d = np_result.shape[1]
index = faiss.IndexFlatL2(d)   # build the index (flat index using L2 distance)


In [None]:
# Empty the index first so that re-running this cell does not add the same
# vectors a second time (duplicates would produce ids past the end of the
# article list when searching).
index.reset()
index.add(np_result)                  # add vectors to the index
print(index.ntotal)

### Querying data

In [None]:
# Display the text of the first sampled article.
articles[0]

In [None]:
# Query text to embed and look up in the index.
q = "William"

In [None]:
# Embed the query with the same model that encoded the articles.
xq = model.encode(q)

In [None]:
# Shape of the query embedding as returned by encode().
xq.shape

In [None]:
# Turn the query embedding into a 2-D array with a single row before it is
# passed to index.search.
xq = xq.reshape(1, -1)

In [None]:
# Shape after the reshape: one row holding the query embedding.
xq.shape

In [None]:
# Ask the index for the two stored vectors nearest to the query.
top_k = 2
index.search(xq, top_k)

In [None]:
# Show the article behind the top search hit. The id is taken from the
# search result instead of being hard-coded, because the sampled articles
# (and therefore the matching position) can differ from run to run.
_, nearest_ids = index.search(xq, 1)
articles[int(nearest_ids[0][0])]