In [1]:
# Make Google Drive visible inside the Colab VM; the data paths used further
# down in this notebook all live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

from google.colab import userdata

# Read the Hugging Face access token from the Colab secret named 'HF_Phi2'.
HF_TOKEN = userdata.get('HF_Phi2')

# The Hugging Face client libraries look for a token under the name HF_TOKEN,
# while this notebook's secret is stored as 'HF_Phi2'. Exporting the value as
# the HF_TOKEN environment variable lets later model downloads authenticate
# instead of silently falling back to anonymous access.
if HF_TOKEN:
    os.environ['HF_TOKEN'] = HF_TOKEN
    print("HuggingFace Token Loaded.")
else:
    # Never print the token itself; only report that nothing usable was found.
    print("HuggingFace token is empty; Hub requests will be unauthenticated.")

HuggingFace Token Loaded.


# Integrating FAISS Index with LangChain for RAG

In [None]:
!pip install langchain langchain-community langchain-huggingface faiss-cpu sentence-transformers

In [4]:
# `langchain.vectorstores` and `langchain.docstore` are legacy re-export paths.
# Import the same classes from the packages that actually define them
# (langchain-community / langchain-core) so the notebook does not depend on
# the deprecated shims being present in the installed LangChain release.
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss
import numpy as np
import pickle

In [5]:
# --- Vector index -----------------------------------------------------------
# Read the serialized FAISS index from Drive.
faiss_index_path = '/content/drive/MyDrive/GenAI-CSA/data/embedded/faiss_index_mpnet.index'
faiss_index = faiss.read_index(faiss_index_path)

# --- Pickled payload --------------------------------------------------------
# Companion pickle; the next cell reads its 'documents' entry.
# NOTE(review): pickle.load can execute arbitrary code, so this path should
# only ever point at a file produced by this project.
path_to_embeddings = '/content/drive/MyDrive/GenAI-CSA/data/embedded/embeddings_mpnet.pkl'
with open(path_to_embeddings, 'rb') as pkl_handle:
    embeddings = pickle.load(pkl_handle)

# Report how many vectors the index holds.
vector_count = faiss_index.ntotal
print(f"Loaded FAISS index with {vector_count} embeddings.")

Loaded FAISS index with 501606 embeddings.


### Rebuilding LangChain FAISS Object:

In [6]:
# Extract the stored document records from the loaded pickle payload.
documents = embeddings['documents']

# The mapping built below ties FAISS row i to document i purely by position,
# so the two collections must have the same length. Fail loudly here rather
# than returning wrong or missing documents at query time.
# NOTE(review): this also assumes the documents were pickled in the same order
# the vectors were added to the index — confirm against the indexing notebook.
if len(documents) != faiss_index.ntotal:
    raise ValueError(
        f"Document count ({len(documents)}) does not match FAISS index size "
        f"({faiss_index.ntotal}); the index file and the pickle are out of sync."
    )

# Convert the raw records into LangChain Document objects.
langchain_docs = [Document(page_content=doc['page_content'], metadata=doc['metadata']) for doc in documents]

# Docstore keyed by the stringified position, plus the FAISS-row -> docstore-id
# mapping that LangChain uses to turn a search hit back into a Document.
docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(langchain_docs)})
index_to_docstore_id = {i: str(i) for i in range(len(langchain_docs))}

# Embedding model used to encode incoming queries.
# NOTE(review): presumably the same model that produced the indexed vectors
# (the file names say "mpnet") — verify, otherwise similarity scores are meaningless.
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-mpnet-base-v2')

# Reassemble the LangChain FAISS wrapper around the pre-built index.
faiss_store = FAISS(
    embedding_function=embedding_model,
    index=faiss_index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

print(f"LangChain FAISS store ready with {faiss_store.index.ntotal} embeddings.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


LangChain FAISS store ready with 501606 embeddings.


### Test Semantic Search with LangChain:

In [7]:
# Smoke-test retrieval: ask the store for the top five matches to a sample query.
query = "Most popular offer."
retrieved_docs = faiss_store.similarity_search(query, k=5)

# Print each hit with a 1-based rank, followed by its text and its metadata.
for rank, hit in enumerate(retrieved_docs, start=1):
    print(
        f"Document {rank}:",
        f"Content: {hit.page_content}",
        f"Metadata: {hit.metadata}",
        sep="\n",
    )

Document 1:
Content: user in multan with a postpaid plan. currently subscribed to offer: offer 49 (offer id: o046). usage details: data browsing allowance of 4990mb, social data allowance of 1813mb, 681 sms, 181 on-net minutes, and 185 off-net minutes. recent transaction on 2024-06-08 11:20:14 with amount charged: 19 units. resource type: data (value: 15). customer support ticket (id: t40586) logged on 2024-01-19 17:15:03 under category: billing. issue description: issue reported under billing category. resolution provided on 2024-01-22 01:15:03: resolved with detailed explanation for billing category.
Metadata: {'city': 'Multan', 'user_type': 'Postpaid', 'offer': 'Offer 49', 'data_allowance': 4990, 'sms_allowance': 681, 'voice_on_net': 181, 'voice_off_net': 185, 'data_social_allowance': 1813, 'amount': 244, 'resource_type': 'Data', 'category': 'Billing'}
Document 2:
Content: user in multan with a postpaid plan. currently subscribed to offer: offer 49 (offer id: o046). usage details: d

In [8]:
# Write the LangChain-compatible FAISS store to a folder on Drive.
langchain_index_dir = '/content/drive/MyDrive/GenAI-CSA/data/embedded/langchain_faiss_index'
faiss_store.save_local(langchain_index_dir)