In [1]:

# ‚úÖ Detect environment (Google Colab vs Local)
import os
import sys
import warnings

# Colab preloads the `google.colab` package into the kernel, so its presence
# in sys.modules distinguishes a Colab runtime from a local one.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    print("‚úÖ Running in Google Colab")
    # Colab: read the keys from the notebook's secret store.
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
    PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
else:
    print("‚úÖ Running in Local Environment")
    # Local: populate the process environment from a .env file, then read it.
    from dotenv import load_dotenv
    load_dotenv()
    # `userdata` only exists in the Colab branch; locally every key, including
    # the optional Hugging Face token, comes from the environment.
    HF_TOKEN = os.getenv("HF_TOKEN")
    PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# The Pinecone and OpenAI keys are mandatory: stop here if either is missing.
assert PINECONE_API_KEY, "‚ùå Pinecone API Key is missing! Set it in .env for local or Colab secrets."
assert OPENAI_API_KEY, "‚ùå OpenAI API Key is missing! Set it in .env for local or Colab secrets."

# Export the keys so code that reads os.environ can find them.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# The Hugging Face token is optional: export it when present, warn otherwise.
if HF_TOKEN:
    os.environ["HF_TOKEN"] = HF_TOKEN
    print("‚úÖ Hugging Face authentication successful!")
else:
    warnings.warn(
        "‚ö†Ô∏è The secret `HF_TOKEN` is not set. To authenticate with Hugging Face Hub, "
        "create a token at https://huggingface.co/settings/tokens and set it in Colab secrets or a .env file."
    )

print("‚úÖ API keys loaded successfully!")



‚úÖ Running in Google Colab
‚úÖ Hugging Face authentication successful!
‚úÖ API keys loaded successfully!


In [2]:

# ‚úÖ Install required packages (only if missing)
try:
    import pinecone
    import langchain_pinecone
    import langchain
    import sentence_transformers
    import openai
except ImportError:
    print("üîÑ Installing required packages...")
    !pip install --upgrade pinecone-client langchain-pinecone langchain sentence-transformers openai dotenv

print("‚úÖ All dependencies are installed!")


‚úÖ All dependencies are installed!



<a href="https://colab.research.google.com/github/ruparee/rag-pipeline-tutorial-notebook/blob/main/rag-pipeline-tutorial-notebook.ipynb" target="_parent">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


In [3]:
# ‚úÖ Ensure all required packages are installed
!pip install --upgrade pinecone-client langchain-pinecone langchain sentence-transformers openai


Collecting langchain-pinecone
  Using cached langchain_pinecone-0.2.3-py3-none-any.whl.metadata (1.3 kB)
Collecting pinecone<6.0.0,>=5.4.0 (from langchain-pinecone)
  Using cached pinecone-5.4.2-py3-none-any.whl.metadata (19 kB)
Collecting aiohttp<3.11,>=3.10 (from langchain-pinecone)
  Using cached aiohttp-3.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting langchain-tests<1.0.0,>=0.3.7 (from langchain-pinecone)
  Using cached langchain_tests-0.3.12-py3-none-any.whl.metadata (3.2 kB)
Collecting pytest-asyncio<1,>=0.20 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Using cached pytest_asyncio-0.25.3-py3-none-any.whl.metadata (3.9 kB)
Collecting syrupy<5,>=4 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Using cached syrupy-4.8.1-py3-none-any.whl.metadata (36 kB)
Collecting pytest-socket<1,>=0.6.0 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Using cached pytest_socket-0.7.0-py3-none-any.whl.metadata (6.7 k

In [4]:

import os
import subprocess

repo_url = "https://github.com/ruparee/rag-pipeline-tutorial.git"
repo_name = "rag-pipeline-tutorial"

# Reuse an existing checkout when present (pulling the latest `main`),
# otherwise clone the repository into the current directory.
if os.path.exists(repo_name):
    print(f"‚úÖ Directory '{repo_name}' already exists. Checking for updates...")
    try:
        # Run git inside the checkout via `cwd=` rather than os.chdir(): the
        # notebook's working directory is then never left changed, even if git
        # fails with something other than CalledProcessError (e.g. git missing).
        subprocess.run(["git", "pull", "origin", "main"], cwd=repo_name, check=True)
        print("‚úÖ Repository updated successfully!")
    except subprocess.CalledProcessError as e:
        # Best effort: a failed pull is not fatal, keep the files already on disk.
        print(f"‚ö†Ô∏è Error pulling latest changes: {e}. Proceeding with existing files.")
else:
    print(f"‚úÖ Cloning repository '{repo_name}'...")
    subprocess.run(["git", "clone", repo_url], check=True)
    print("‚úÖ Repository cloned successfully!")


‚úÖ Directory 'rag-pipeline-tutorial' already exists. Checking for updates...
‚úÖ Repository updated successfully!



# 🔍 **RAG Pipeline with Pinecone & Sentence Transformers**
This notebook implements a **Retrieval-Augmented Generation (RAG) pipeline** using:
- **Google Colab's Secure Secret Management** (`userdata.get()`)
- **Pinecone for vector storage**
- **`sentence-transformers` for local embeddings**
- **Fixes for API limits, mismatched dimensions, and deletion protection**


In [5]:

# ‚úÖ Access secret keys securely in Google Colab
import os

# Read the keys from Colab's secret store when it is available; outside Colab
# the `google.colab` package cannot be imported, so fall back to the process
# environment instead of failing with ImportError.
try:
    from google.colab import userdata
except ImportError:
    PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
else:
    PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Ensure keys are set before proceeding
assert PINECONE_API_KEY, "Pinecone API Key is missing!"
assert OPENAI_API_KEY, "OpenAI API Key is missing!"

# Export the keys so code that reads os.environ can find them.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

print("‚úÖ API keys loaded securely!")


‚úÖ API keys loaded securely!


In [6]:

# from pinecone import Pinecone

# # ‚úÖ Initialize Pinecone client
# pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# index_name = "rag-pipeline-tutorial"

# # ‚úÖ Disable deletion protection before recreating the index
# try:
#     pc.configure_index(index_name, deletion_protection="disabled")
#     print(f"‚úÖ Deletion protection disabled for index: {index_name}")
# except Exception as e:
#     print(f"‚ö†Ô∏è Warning: Could not disable deletion protection. Index may not exist yet. {e}")

# # ‚úÖ Delete existing index if it exists
# existing_indexes = [index["name"] for index in pc.list_indexes()]
# if index_name in existing_indexes:
#     pc.delete_index(index_name)
#     print(f"‚úÖ Index '{index_name}' deleted successfully.")
# else:
#     print(f"‚úÖ No existing index found. Proceeding to create a new one.")


In [7]:
# # ‚úÖ Create a new Pinecone index with the correct dimension (384 for local embeddings)
# from pinecone import ServerlessSpec

# # ‚úÖ Create a new Pinecone index with the correct dimension (384 for local embeddings)
# pc.create_index(
#     name=index_name,
#     dimension=384,  # Matches `all-MiniLM-L6-v2` model
#     metric="euclidean",
#     deletion_protection="enabled",  # Re-enable if needed
#         spec=ServerlessSpec(  # Correcting the spec definition
#         cloud="aws",
#         region="us-east-1"
#     )
# )
# print(f"‚úÖ New Pinecone index '{index_name}' created with dimension 384.")


from pinecone import Pinecone, ServerlessSpec

# Initialize the Pinecone client from the key exported earlier in the notebook.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "rag-pipeline-tutorial"

# Embedding dimension the index must have (matches the `all-MiniLM-L6-v2` model).
EXPECTED_DIMENSION = 384

# Look up the index and, when it exists, compare its dimension to the expected one.
existing_indexes = [index["name"] for index in pc.list_indexes()]
current_dimension = None  # stays None when the index does not exist yet
if index_name in existing_indexes:
    index_stats = pc.Index(index_name).describe_index_stats()
    current_dimension = index_stats.get("dimension", None)

    if current_dimension == EXPECTED_DIMENSION:
        print(f"‚úÖ Index '{index_name}' already exists with correct dimension ({EXPECTED_DIMENSION}). Skipping deletion.")
    else:
        print(f"‚ö†Ô∏è Dimension mismatch! Expected {EXPECTED_DIMENSION}, but found {current_dimension}. Recreating index...")
        # This notebook creates the index with deletion protection enabled
        # (see create_index below), so protection has to be switched off
        # before the index can be deleted.
        pc.configure_index(index_name, deletion_protection="disabled")
        pc.delete_index(index_name)
        print(f"‚úÖ Deleted index '{index_name}'. Proceeding with recreation.")
else:
    print(f"‚úÖ No existing index found. Proceeding to create a new one.")

# Create the index only when it is missing or was just deleted for a mismatch.
if index_name not in existing_indexes or current_dimension != EXPECTED_DIMENSION:
    pc.create_index(
        name=index_name,
        dimension=EXPECTED_DIMENSION,  # Ensure it matches the model
        metric="euclidean",
        deletion_protection="enabled",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print(f"‚úÖ New Pinecone index '{index_name}' created with dimension {EXPECTED_DIMENSION}.")


‚úÖ Index 'rag-pipeline-tutorial' already exists with correct dimension (384). Skipping deletion.


In [8]:

from sentence_transformers import SentenceTransformer
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings.base import Embeddings

# Load a local embedding model (384-dimensional output).
embeddings_model = SentenceTransformer("all-MiniLM-L6-v2")


class LocalEmbeddings(Embeddings):
    """LangChain `Embeddings` adapter around the module-level `embeddings_model`."""

    def embed_documents(self, texts):
        """Embed a batch of texts.

        Args:
            texts: the strings to embed.

        Returns:
            A list with one embedding (a list of floats) per input text.
        """
        return embeddings_model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, text):
        """Embed a single query string.

        Args:
            text: the query to embed.

        Returns:
            One flat list of floats. Encoding the bare string (not `[text]`)
            yields a 1-D array, so the result is a single vector rather than a
            one-element list of vectors.
        """
        return embeddings_model.encode(text, convert_to_numpy=True).tolist()


embeddings = LocalEmbeddings()

print("‚úÖ Local embeddings model loaded successfully!")


‚úÖ Local embeddings model loaded successfully!


In [9]:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# ‚úÖ Example documents (Replace with your actual dataset)
# Sample corpus for the tutorial; swap in a real dataset here.
docs = [
    "Vector databases store high-dimensional vectors used for semantic search.",
    "Pinecone is a serverless vector database optimized for AI applications.",
    "Large Language Models (LLMs) use vector databases to improve retrieval accuracy."
]

# Wrap every raw string in a LangChain `Document`.
documents = list(map(lambda text: Document(page_content=text), docs))

# Chunk the documents: at most 100 characters per chunk, 10 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
)
split_docs = text_splitter.split_documents(documents)

print(f"‚úÖ Loaded and split {len(split_docs)} document chunks!")


‚úÖ Loaded and split 3 document chunks!


In [10]:

# ‚úÖ Store document vectors in Pinecone
import hashlib

# Derive each vector ID from the chunk text so that re-running this cell
# overwrites the same records instead of inserting duplicates (identical chunks
# intentionally share one ID). Vectors stored by earlier runs under other IDs
# are not removed by this.
chunk_ids = [
    hashlib.sha256(chunk.page_content.encode("utf-8")).hexdigest()
    for chunk in split_docs
]

vectorstore = PineconeVectorStore.from_documents(
    split_docs,
    embeddings,
    index_name=index_name,
    ids=chunk_ids,
)
print("‚úÖ Documents successfully stored in Pinecone!")


‚úÖ Documents successfully stored in Pinecone!


In [11]:

# # ‚úÖ Run a similarity search query
# query = "What is a vector database?"
# results = vectorstore.similarity_search(query)

# # ‚úÖ Print retrieved results
# for i, doc in enumerate(results):
#     print(f"Result {i+1}: {doc.page_content}")

# ######### THIS WORKS

# import time

# print("‚è≥ Waiting for Pinecone index to be ready...")
# time.sleep(20)  # Allow Pinecone to fully initialize

# print("‚úÖ Running similarity search...")
# query = "What is a vector database?"
# results = vectorstore.similarity_search(query)

# for i, doc in enumerate(results):
#     print(f"Result {i+1}: {doc.page_content}")


#  ######################### WORKS TOO
# import time

# # Ensure index is populated before querying
# index_stats = vectorstore._index.describe_index_stats()
# total_vectors = index_stats.get("total_vector_count", 0)

# if total_vectors == 0:
#     print("‚è≥ No vectors found in index. Waiting for data to be indexed...")
#     time.sleep(20)  # Give Pinecone time to index the data

# print("‚úÖ Running similarity search...")
# query = "What is a vector database?"
# results = vectorstore.similarity_search(query)

# # Print results
# if results:
#     for i, doc in enumerate(results):
#         print(f"Result {i+1}: {doc.page_content}")
# else:
#     print("‚ö†Ô∏è No results found! Try running the search again.")


# #### WORKS  AS WELL
# import time

# # ‚úÖ Function to wait for Pinecone index readiness with retries
# def wait_for_index(vectorstore, max_retries=4, wait_time=5):
#     for attempt in range(max_retries):
#         index_stats = vectorstore._index.describe_index_stats()
#         total_vectors = index_stats.get("total_vector_count", 0)

#         if total_vectors > 0:
#             print(f"‚úÖ Index is ready with {total_vectors} vectors.")
#             return

#         print(f"‚è≥ No vectors found (Attempt {attempt+1}/{max_retries}). Waiting {wait_time}s before retrying...")
#         time.sleep(wait_time)

#     print("‚ö†Ô∏è Index is still empty after retries. Proceeding, but results may be incomplete.")

# # ‚úÖ Ensure index is populated before querying with retry logic
# wait_for_index(vectorstore, max_retries=4, wait_time=5)

# print("‚úÖ Running similarity search...")
# query = "What is a vector database?"
# results = vectorstore.similarity_search(query)

# # ‚úÖ Print results
# if results:
#     for i, doc in enumerate(results):
#         print(f"Result {i+1}: {doc.page_content}")
# else:
#     print("‚ö†Ô∏è No results found! Try running the search again.")




# Updated Code with Exponential Backoff
import time

# ‚úÖ Function to wait for Pinecone index readiness with exponential backoff
def wait_for_index(vectorstore, max_retries=5, initial_wait=5, max_wait=60):
    """Poll the vector store's index until it reports at least one vector.

    The index statistics are checked up to `max_retries` times. Between checks
    the function sleeps, doubling the delay each time (exponential backoff).
    Every sleep is followed by another check, and the summed sleep time never
    exceeds `max_wait`.

    Args:
        vectorstore: object whose `_index.describe_index_stats()` returns a
            mapping with a "total_vector_count" entry.
        max_retries: maximum number of checks to perform.
        initial_wait: seconds to sleep after the first unsuccessful check.
        max_wait: upper bound, in seconds, on a single sleep and on the total
            time slept.

    Returns:
        True as soon as the index reports one or more vectors, False if it is
        still empty once the retries or the wait budget are exhausted.
    """
    wait_time = initial_wait
    total_wait = 0  # seconds slept so far

    for attempt in range(1, max_retries + 1):
        index_stats = vectorstore._index.describe_index_stats()
        total_vectors = index_stats.get("total_vector_count", 0)

        if total_vectors > 0:
            print(f"‚úÖ Index is ready with {total_vectors} vectors.")
            return True

        # Sleeping is only useful when another check will follow: stop right
        # away on the final attempt or once the wait budget is used up.
        if attempt == max_retries or total_wait >= max_wait:
            break

        # Never sleep past the remaining budget.
        wait_time = min(wait_time, max_wait - total_wait)
        print(f"‚è≥ No vectors found (Attempt {attempt}/{max_retries}). Waiting {wait_time}s before retrying...")
        time.sleep(wait_time)

        # Double the delay for the next round, capped at max_wait.
        total_wait += wait_time
        wait_time = min(wait_time * 2, max_wait)

    print("‚ö†Ô∏è Index is still empty after retries. Proceeding, but results may be incomplete.")
    return False

# ‚úÖ Ensure index is populated before querying with retry logic
# Give the index a chance to report vectors; the search runs regardless of the outcome.
wait_for_index(
    vectorstore,
    max_retries=5,
    initial_wait=5,
    max_wait=60,
)

print("‚úÖ Running similarity search...")
query = "What is a vector database?"
results = vectorstore.similarity_search(query)

# Report the hits in ranked order, or say so when nothing came back.
if not results:
    print("‚ö†Ô∏è No results found! Try running the search again.")
else:
    for i, doc in enumerate(results):
        print(f"Result {i+1}: {doc.page_content}")




‚úÖ Index is ready with 9 vectors.
‚úÖ Running similarity search...
Result 1: Vector databases store high-dimensional vectors used for semantic search.
Result 2: Vector databases store high-dimensional vectors used for semantic search.
Result 3: Vector databases store high-dimensional vectors used for semantic search.
Result 4: Large Language Models (LLMs) use vector databases to improve retrieval accuracy.



## 🚀 **Next Enhancements**
1. **Improve retrieval quality** – Fine-tune embeddings for domain-specific knowledge.
2. **Optimize query performance** – Implement vector caching strategies.
3. **Enhance batch processing** – Improve bulk vector updates in Pinecone.
4. **Implement Hybrid Search** – Combine **Vector + Keyword Search** for better accuracy.
5. **Use Re-Ranking models** – Apply `cross-encoder` to improve ranking.
6. **Expand Data Sources** – Integrate a more diverse document set.
7. **Integrate a Chatbot** – Build an AI chatbot using the Pinecone knowledge base.

🔹 This notebook **fully integrates fixes for API limits, mismatched dimensions, deletion protection, and retrieval optimizations**.  
💡 Feel free to experiment and extend the pipeline with the listed enhancements! 🎯  
