Multimodal data in vector databases, different vector databases, Pinecone, LangChain Hub, indexing

In [1]:
import os
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment
# (returned True in the recorded run).
load_dotenv()

True

In [3]:
# Re-export the API credentials only when they are actually defined.
# Assigning None to os.environ raises TypeError, so a missing variable is
# reported and skipped instead of being copied blindly.
for _name in ("HF_TOKEN", "GOOGLE_API_KEY"):
    _value = os.getenv(_name)
    if _value is None:
        print(f"Warning: environment variable {_name} is not set")
    else:
        os.environ[_name] = _value

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
# NOTE: the recorded run failed on the next line with an ImportError because
# the `sentence-transformers` package was not installed; install it
# (`pip install sentence-transformers`) before running this cell.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.

In [None]:
# Smoke-test the Hugging Face embedder on a sample query.
embedding.embed_query("This is a test query")

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Initialize the Google embedding model; `google_embedding` is the embedder
# handed to the Pinecone vector store further down in this notebook.
google_embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [None]:
# Smoke-test the Google embedder on a sample query.
google_embedding.embed_query("This is a test query")

In [None]:
from pinecone import Pinecone
# Read the Pinecone key from the environment; os.getenv returns None when
# PINECONE_API_KEY is not set, so make sure it is defined (e.g. in .env).
pine_cone_api = os.getenv("PINECONE_API_KEY")
# Client object used below to create and open the index.
pc = Pinecone(api_key=pine_cone_api)

In [None]:
from pinecone import ServerlessSpec

In [None]:
# Name of the Pinecone index created and opened in the cells below.
my_pinecone_index = "my-pinecone-index"

In [None]:
# Create the serverless index only when it does not exist yet, so this cell
# can be re-run without failing on a duplicate index name.
# NOTE(review): dimension=768 should match the vector size produced by the
# embedding model used with this index (google_embedding) — confirm.
if my_pinecone_index not in pc.list_indexes().names():
    pc.create_index(
        name=my_pinecone_index,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [None]:
# Open a handle to the index by name.
pinecone_index = pc.Index(my_pinecone_index)

In [None]:
from langchain_pinecone import PineconeVectorStore

In [None]:
# LangChain vector store backed by the Pinecone index, embedding text with
# the Google embedding model.
pinecone_vector_store = PineconeVectorStore(
    embedding=google_embedding,
    index=pinecone_index,
)

In [None]:
from langchain_core.documents import Document

# (text, source) pairs for the ten sample documents, in document_1..10 order.
_SAMPLE_ROWS = [
    ("I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
    ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
    ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
    ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
    ("Is the new iPhone worth the price? Read this review to find out.", "website"),
    ("The top 10 soccer players in the world right now.", "website"),
    ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
    ("The stock market is down 500 points today due to fears of a recession.", "news"),
    ("I have a bad feeling I am going to get deleted :(", "tweet"),
]

# Build one Document per row and bind each to its own module-level name.
(
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
) = [
    Document(page_content=text, metadata={"source": source})
    for text, source in _SAMPLE_ROWS
]

In [None]:
# All ten sample documents, in order, ready to be added to the vector store.
documents = [
    document_1, document_2, document_3, document_4, document_5,
    document_6, document_7, document_8, document_9, document_10,
]

In [None]:
from uuid import uuid4

# One fresh random UUID string per document.
uuids = [str(uuid4()) for _ in documents]

In [None]:
# Upload the documents using the pre-generated UUIDs as record IDs.
# The vector store's ID parameter is `ids`; the previous `uuids=` keyword is
# not a recognised argument, so the generated UUIDs were never applied.
pinecone_vector_store.add_documents(documents=documents, ids=uuids)

In [None]:
# Similarity search restricted to documents whose metadata source is "tweet".
results = pinecone_vector_store.similarity_search(
    "what is langchain?",
    filter={"source": "tweet"},
)

In [None]:
# Expose the vector store as a retriever with search_kwargs k=2
# (the recorded query below returned two documents).
pinecone_retriever = pinecone_vector_store.as_retriever(search_kwargs={"k": 2})

In [None]:
# Query the retriever directly.
# NOTE(review): in the recorded output the same tweet comes back twice under
# different ids, which suggests the sample documents were uploaded more than
# once — verify the index contents.
pinecone_retriever.invoke(
    "What is the best framework for building stateful, agentic applications?")

[Document(id='66fcd1eb-f04a-4ca5-a23d-87d74b76d88e', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='91e9cbee-e051-4d79-aa8e-356992f5d71d', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model used as the generator in the RAG chains below.
# NOTE(review): presumably authenticates via the GOOGLE_API_KEY environment
# variable exported earlier in this notebook — confirm.
google_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [None]:
from langchain import hub
# Pull the shared "rlm/rag-prompt" from the LangChain hub; as the recorded
# output further down shows, it renders a single human message built from
# `question` and `context`.
prompt=hub.pull("rlm/rag-prompt")

In [None]:
import pprint

In [None]:
from langchain_core.prompts import PromptTemplate

In [None]:
# Local prompt for the Pinecone RAG chain: the retrieved context is
# inserted first, then the user's question.
pinecone_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are an assistant that answers questions based on the provided context.\n\n"
        "Context: {context}\n\n"
        "Question: {question}\n\n"
        "Answer:"
    ),
)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [None]:
# Render the hub prompt with sample values to inspect the resulting messages.
# invoke() takes the input mapping (plus an optional config). The
# `output_parser` / `run_manager` keywords that used to be passed here are
# not prompt parameters and had no effect on the result, so they are removed.
prompt.invoke(
    {
        "question": "What is the best framework for building stateful, agentic applications?",
        "context": "langchain",
    }
)

ChatPromptValue(messages=[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: What is the best framework for building stateful, agentic applications? \nContext: langchain \nAnswer:", additional_kwargs={}, response_metadata={})])

In [None]:
# LCEL pipeline: retrieve context -> fill the prompt -> call the model ->
# parse the reply into a plain string.
rag_chain = (
    {
        # Join the retrieved documents' text so the prompt receives plain
        # context rather than the repr of a list of Document objects
        # (same formatting that custom_rag_chain applies).
        "context": pinecone_retriever
        | (lambda docs: "\n".join(doc.page_content for doc in docs)),
        # The raw input string is forwarded unchanged as the question.
        "question": RunnablePassthrough(),
    }
    | pinecone_prompt
    | google_model
    | StrOutputParser()
)

In [None]:
# Ask a question the sample documents do not cover; the recorded answer
# reports that the context has no information about it.
rag_chain.invoke("what is llm model")

'The provided text does not contain information about LLMs (Large Language Models).  Therefore, I cannot answer your question based on the given context.'

In [None]:
def custom_rag_chain(question: str) -> str:
    """Answer ``question`` using documents retrieved from Pinecone.

    The steps are: fetch documents for the question with
    ``pinecone_retriever``, join their ``page_content`` with newlines to form
    the context, fill ``pinecone_prompt``, send the prompt to
    ``google_model``, and run the reply through ``StrOutputParser``.

    Args:
        question: The user's question.

    Returns:
        The parsed model reply.
    """
    matches = pinecone_retriever.invoke(question)
    context_text = "\n".join(match.page_content for match in matches)
    prompt_text = pinecone_prompt.format(question=question, context=context_text)
    reply = google_model.invoke(prompt_text)
    return StrOutputParser().invoke(reply)

In [None]:
# Try the hand-written RAG function on a sample question and print the
# model's answer.
result = custom_rag_chain("What is the best framework for building AI applications?")
print(result)

Based on the provided text, LangGraph is the best framework for building stateful, agentic applications.  The text doesn't specify if this makes it the *best* for all AI applications, only those that are stateful and agentic.
