<a href="https://colab.research.google.com/github/ruparee/rag-pipeline-tutorial-notebook/blob/main/rag-pipeline-tutorial-notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/ruparee/rag-pipeline-tutorial.git

In [None]:
!pip install langchain_community langchain_pinecone langchain_openai unstructured langchainhub langchain_text_splitters

In [None]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import os
import glob

In [None]:
# Build a loader over the cloned repository that matches Markdown files at any depth.
loader = DirectoryLoader(
    'rag-pipeline-tutorial',
    glob="**/*.md",
    show_progress=True,
    use_multithreading=True,
)

In [None]:
# Load the files matched by the loader's glob; the result is split and indexed in later cells.
docs = loader.load()

In [None]:
# Report how many documents were loaded and preview only the first few;
# displaying the whole collection floods the notebook output.
print(f"Loaded {len(docs)} documents")
docs[:3]

In [None]:
from google.colab import userdata

# Copy each API key from the Colab secrets store into the process environment,
# where the OpenAI and Pinecone clients look for them.
for secret_name in ('OPENAI_API_KEY', 'PINECONE_API_KEY'):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
import time

from pinecone import Pinecone, ServerlessSpec

# Initialize Pinecone client
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

# Set the name of your Pinecone Index here
index_name = 'rag-pipeline-tutorial'


def _index_exists(name):
    """Return True if an index called `name` is currently listed by the client."""
    return name in [idx["name"] for idx in pc.list_indexes()]


# Start from a clean index on every run. Deletion protection is only touched
# when the index actually exists: configuring a missing index would fail and
# break the first run of this notebook.
if _index_exists(index_name):
    pc.configure_index(
        index_name,
        deletion_protection="disabled"  # Disable protection
    )
    print(f"Deletion protection disabled for index: {index_name}")

    print(f"Deleting existing index: {index_name}")
    pc.delete_index(index_name)

    # Wait until the old index is really gone before re-using its name.
    while _index_exists(index_name):
        time.sleep(1)
else:
    print(f"Index {index_name} does not exist, skipping delete.")

# Create the Pinecone index
pc.create_index(
    name=index_name,
    # dimension=3072,  # use this instead for the OpenAI 'text-embedding-3-large' alternative
    dimension=384,  # must match the output size of the embedding model used below
    metric='euclidean',
    deletion_protection='enabled',
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

# Block until the new index reports ready, so later upserts and queries do not
# hit an index that is still initializing.
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

index = pc.Index(index_name)

# Run a sanity check on the index
index.describe_index_stats()

In [None]:
# Alternative (disabled): OpenAI embeddings instead of the local model.
# If enabled, the Pinecone index dimension must match the OpenAI model's output size.
# # Initialize embeddings and the vector store
# embeddings = OpenAIEmbeddings(
#     model='text-embedding-3-large'
# )

# # Split the documents into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# split_docs = text_splitter.split_documents(docs)

# # Create a vector store for the documents using the specified embeddings
# vectorstore = PineconeVectorStore.from_documents(split_docs, embeddings, index_name=index_name)


# Run a sanity check on the index.
# This is not the last expression of the cell, so it must be printed to be visible.
print(index.describe_index_stats())

# Load a local embedding model (384D)
embeddings_model = SentenceTransformer("all-MiniLM-L6-v2")

class LocalEmbeddings:
    """Adapter exposing a local encoder through the ``embed_documents`` /
    ``embed_query`` methods that LangChain vector stores call.

    Args:
        model: Object with an ``encode(texts, convert_to_numpy=True)`` method
            that returns a 2-D array with one row per input text. When omitted,
            the notebook-level ``embeddings_model`` is used, so the existing
            ``LocalEmbeddings()`` call keeps working unchanged.
    """

    def __init__(self, model=None):
        # Resolve the notebook global only when no model is supplied.
        self._model = embeddings_model if model is None else model

    def embed_documents(self, texts):
        """Return one embedding (a list of floats) per text in ``texts``."""
        texts = list(texts)
        if not texts:
            # Nothing to encode; skip the model call entirely.
            return []
        return self._model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, text):
        """Return a single flat embedding (a list of floats) for ``text``."""
        # Encoding a one-item batch yields one row; take row 0 so the result is
        # a flat vector rather than a nested [[...]] list.
        return self._model.encode([text], convert_to_numpy=True)[0].tolist()

# Split the documents into chunks. `split_docs` is consumed by
# `from_documents` below, so this step has to run in this cell.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
split_docs = text_splitter.split_documents(docs)

# Use local embeddings with Pinecone
embeddings = LocalEmbeddings()

# Create the vector store: embeds every chunk and stores it in the Pinecone index
vectorstore = PineconeVectorStore.from_documents(split_docs, embeddings, index_name=index_name)

print("Documents successfully stored in Pinecone!")

# Ask a query that is likely to score a hit against your corpus of text or data.
# In this example project, there is a blog post about vector databases.
query = "What is a vector database?"
vectorstore.similarity_search(query)

In [None]:
# Convenience cell: ask further ad-hoc questions against the `vectorstore`
# created in the previous cell. Edit `query` and re-run this cell only;
# nothing is re-indexed here.
query = "What are some kubernetes best practices?"

vectorstore.similarity_search(query)