
<a href="https://colab.research.google.com/github/ruparee/rag-pipeline-tutorial-notebook/blob/main/rag-pipeline-tutorial-notebook.ipynb" target="_parent">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


In [None]:

import os
import warnings

# ✅ Detect environment (Google Colab vs Local)
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
else:
    from dotenv import load_dotenv
    load_dotenv()  # Load API keys from .env file
    HF_TOKEN = os.getenv("HF_TOKEN")

# ✅ Authenticate with Hugging Face Hub if a token is available
if HF_TOKEN:
    os.environ["HF_TOKEN"] = HF_TOKEN
    print("✅ Hugging Face authentication successful!")
else:
    warnings.warn(
        "⚠️ The secret `HF_TOKEN` is not set. To authenticate with Hugging Face Hub, "
        "create a token at https://huggingface.co/settings/tokens and set it in Colab secrets or a .env file."
    )


In [6]:
# ✅ Ensure all required packages are installed
!pip install --upgrade pinecone-client langchain-pinecone langchain sentence-transformers openai


Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.3-py3-none-any.whl.metadata (1.3 kB)
Collecting openai
  Downloading openai-1.63.0-py3-none-any.whl.metadata (27 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting pinecone<6.0.0,>=5.4.0 (from langchain-pinecone)
  Downloading pinecone-5.4.2-py3-none-any.whl.metadata (19 kB)
Collecting aiohttp<3.11,>=3.10 (from langchain-pinecone)
  Downloading aiohttp-3.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting langchain-tests<1.0.0,>=0.3.7 (from langchain-pinecone)
  Downloading langchain_tests-0.3.12-py3-none-any.whl.m

In [7]:

import os
import subprocess

repo_url = "https://github.com/ruparee/rag-pipeline-tutorial.git"
repo_name = "rag-pipeline-tutorial"

# Check if the directory exists
if os.path.exists(repo_name):
    print(f"✅ Directory '{repo_name}' already exists. Checking for updates...")
    os.chdir(repo_name)  # Move into the repo directory
    try:
        subprocess.run(["git", "pull", "origin", "main"], check=True)
        print("✅ Repository updated successfully!")
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Error pulling latest changes: {e}. Proceeding with existing files.")
    os.chdir("..")  # Move back to the original directory
else:
    print(f"✅ Cloning repository '{repo_name}'...")
    subprocess.run(["git", "clone", repo_url], check=True)
    print("✅ Repository cloned successfully!")


✅ Directory 'rag-pipeline-tutorial' already exists. Checking for updates...
✅ Repository updated successfully!



# 🔍 **RAG Pipeline with Pinecone & Sentence Transformers**
This notebook implements a **Retrieval-Augmented Generation (RAG) pipeline** using:
- **Google Colab's Secure Secret Management** (`userdata.get()`)
- **Pinecone for vector storage**
- **`sentence-transformers` for local embeddings**
- **Fixes for API limits, mismatched dimensions, and deletion protection**


In [8]:

# ✅ Access secret keys securely in Google Colab
from google.colab import userdata

PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Ensure keys are set before proceeding
assert PINECONE_API_KEY, "Pinecone API Key is missing!"
assert OPENAI_API_KEY, "OpenAI API Key is missing!"

import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

print("✅ API keys loaded securely!")


✅ API keys loaded securely!


In [9]:

# from pinecone import Pinecone

# # ✅ Initialize Pinecone client
# pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# index_name = "rag-pipeline-tutorial"

# # ✅ Disable deletion protection before recreating the index
# try:
#     pc.configure_index(index_name, deletion_protection="disabled")
#     print(f"✅ Deletion protection disabled for index: {index_name}")
# except Exception as e:
#     print(f"⚠️ Warning: Could not disable deletion protection. Index may not exist yet. {e}")

# # ✅ Delete existing index if it exists
# existing_indexes = [index["name"] for index in pc.list_indexes()]
# if index_name in existing_indexes:
#     pc.delete_index(index_name)
#     print(f"✅ Index '{index_name}' deleted successfully.")
# else:
#     print(f"✅ No existing index found. Proceeding to create a new one.")


✅ Deletion protection disabled for index: rag-pipeline-tutorial
✅ Index 'rag-pipeline-tutorial' deleted successfully.


In [None]:

from pinecone import Pinecone

# ✅ Initialize Pinecone client
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "rag-pipeline-tutorial"

# ✅ Define expected embedding dimension
EXPECTED_DIMENSION = 384  # Matches `all-MiniLM-L6-v2` model

# ✅ Check if the index exists and compare dimensions
existing_indexes = [index["name"] for index in pc.list_indexes()]
if index_name in existing_indexes:
    index_stats = pc.Index(index_name).describe_index_stats()
    current_dimension = index_stats.get("dimension", None)

    if current_dimension == EXPECTED_DIMENSION:
        print(f"✅ Index '{index_name}' already exists with correct dimension ({EXPECTED_DIMENSION}). Skipping deletion.")
    else:
        print(f"⚠️ Dimension mismatch! Expected {EXPECTED_DIMENSION}, but found {current_dimension}. Recreating index...")
        pc.delete_index(index_name)
        print(f"✅ Deleted index '{index_name}'. Proceeding with recreation.")
else:
    print(f"✅ No existing index found. Proceeding to create a new one.")

# ✅ Create Pinecone index only if necessary
if index_name not in existing_indexes or current_dimension != EXPECTED_DIMENSION:
    from pinecone import ServerlessSpec

    pc.create_index(
        name=index_name,
        dimension=EXPECTED_DIMENSION,  # Ensure it matches the model
        metric="euclidean",
        deletion_protection="enabled",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print(f"✅ New Pinecone index '{index_name}' created with dimension {EXPECTED_DIMENSION}.")


In [14]:

from sentence_transformers import SentenceTransformer
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings.base import Embeddings

# ✅ Load a local embedding model (384D)
embeddings_model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Wrapper to ensure compatibility with LangChain
class LocalEmbeddings(Embeddings):
    def embed_documents(self, texts):
        return embeddings_model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, text):
        return embeddings_model.encode([text], convert_to_numpy=True).tolist()

embeddings = LocalEmbeddings()

print("✅ Local embeddings model loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Local embeddings model loaded successfully!


In [21]:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# ✅ Example documents (Replace with your actual dataset)
docs = [
    "Vector databases store high-dimensional vectors used for semantic search.",
    "Pinecone is a serverless vector database optimized for AI applications.",
    "Large Language Models (LLMs) use vector databases to improve retrieval accuracy."
]

# ✅ Convert docs into a list of `Document` objects
documents = [Document(page_content=doc) for doc in docs]


# ✅ Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
# ✅ Use the correct method to split documents
split_docs = text_splitter.split_documents(documents)


print(f"✅ Loaded and split {len(split_docs)} document chunks!")


✅ Loaded and split 3 document chunks!


In [22]:

# ✅ Store document vectors in Pinecone
vectorstore = PineconeVectorStore.from_documents(split_docs, embeddings, index_name=index_name)
print("✅ Documents successfully stored in Pinecone!")


✅ Documents successfully stored in Pinecone!


In [24]:

# ✅ Run a similarity search query
query = "What is a vector database?"
results = vectorstore.similarity_search(query)

# ✅ Print retrieved results
for i, doc in enumerate(results):
    print(f"Result {i+1}: {doc.page_content}")


Result 1: Vector databases store high-dimensional vectors used for semantic search.
Result 2: Large Language Models (LLMs) use vector databases to improve retrieval accuracy.
Result 3: Pinecone is a serverless vector database optimized for AI applications.



## 🚀 **Next Enhancements**
1. **Improve retrieval quality** – Fine-tune embeddings for domain-specific knowledge.
2. **Optimize query performance** – Implement vector caching strategies.
3. **Enhance batch processing** – Improve bulk vector updates in Pinecone.
4. **Implement Hybrid Search** – Combine **Vector + Keyword Search** for better accuracy.
5. **Use Re-Ranking models** – Apply `cross-encoder` to improve ranking.
6. **Expand Data Sources** – Integrate a more diverse document set.
7. **Integrate a Chatbot** – Build an AI chatbot using the Pinecone knowledge base.

🔹 This notebook **fully integrates fixes for API limits, mismatched dimensions, deletion protection, and retrieval optimizations**.  
💡 Feel free to experiment and extend the pipeline with the listed enhancements! 🎯  
