### **Note:** This code has been optimized for the Google Colab platform for the sake of GPU utilization.

In [1]:
# Attach Google Drive to the Colab filesystem; the data paths used later in
# this notebook all live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# Storing Embeddings in Vector DB for RAG

In [None]:
!pip install faiss-cpu langchain

In [3]:
import numpy as np
import pickle

# Load Embeddings .pkl File
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only point this at a file you produced yourself or otherwise trust.
path_to_embeddings = '/content/drive/MyDrive/GenAI-CSA/data/embedded/embeddings_mpnet.pkl'
with open(path_to_embeddings, 'rb') as file:
    embeddings = pickle.load(file)

# Extract Vectors
embedding_vectors = embeddings['embeddings']

# Convert to Array
# order='C' guarantees a row-major float32 buffer, which is the layout FAISS
# operates on. np.array still makes a copy, so later in-place operations on
# embeddings_array leave the unpickled object untouched.
embeddings_array = np.array(embedding_vectors, dtype=np.float32, order='C')

# Fail here with a readable message instead of an IndexError further down
# when the pickle does not contain a (n_vectors, dim) matrix.
if embeddings_array.ndim != 2 or embeddings_array.size == 0:
    raise ValueError(
        "Expected a non-empty 2-D array of embeddings (n_vectors, dim), "
        f"got shape {embeddings_array.shape}"
    )

print(f"Embeddings numpy array shape: {embeddings_array.shape}")
print(f"Data type of embeddings: {embeddings_array.dtype}")

Embeddings numpy array shape: (501606, 768)
Data type of embeddings: float32


### FAISS Integration

In [4]:
import faiss

# Exact (brute-force) inner-product index sized to the embedding width.
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatIP(dimension)

# Cosine similarity = inner product of unit-length vectors, so rescale every
# row to unit L2 norm first. normalize_L2 rewrites embeddings_array in place.
faiss.normalize_L2(embeddings_array)
index.add(embeddings_array)

print(f"Number of embeddings in the FAISS index: {index.ntotal}")

Number of embeddings in the FAISS index: 501606


In [7]:
# Save the FAISS Index
# Serialise the in-memory index to a single file under the mounted Drive
# folder (same directory the embeddings pickle was read from).
faiss_path = '/content/drive/MyDrive/GenAI-CSA/data/embedded/faiss_index_mpnet.index'
faiss.write_index(index, faiss_path)

# Echo the destination so the cell output records where the index was written.
print(f"FAISS index saved at: {faiss_path}")

FAISS index saved at: /content/drive/MyDrive/GenAI-CSA/data/embedded/faiss_index_mpnet.index
