# Vector store and retrieval

In [13]:
from langchain_core.embeddings import Embeddings
from langchain_core.retrievers import BaseRetriever
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
from dotenv import load_dotenv

In [4]:
# Load variables from a local .env file into the process environment
# (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()

True

## Vector store creation

In [5]:
# Embedding model shared by the notebook: it is handed to Chroma when the
# store is built and is used again to embed queries in the custom retriever.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [6]:
# Splitter configuration: break the text on line breaks, aim for chunks of
# size 200, and do not repeat any text between neighbouring chunks.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=0)

In [7]:
# Read facts.txt and cut it into chunks with the splitter configured above.
# The encoding is pinned to UTF-8: without it TextLoader uses the platform
# default, which mis-decodes the file's non-ASCII punctuation (e.g. the curly
# apostrophes in facts 77 and 78) on systems whose default is not UTF-8.
loader = TextLoader("facts.txt", encoding="utf-8")

# load() followed by split_documents() is what load_and_split() does
# internally; calling the two steps directly avoids a method that LangChain's
# own documentation describes as deprecated.
docs = text_splitter.split_documents(loader.load())

Run the `db` cell below only once. If you run it multiple times, the vector store will end up with multiple copies of the same documents.

In [8]:
# Embed every chunk and store it in the Chroma collection persisted under "emb".
#
# Each chunk gets a deterministic id derived from its position in `docs`.
# Chroma writes by id, so re-running this cell overwrites the existing entries
# instead of appending a second copy of every document (when no ids are given,
# fresh random ids are generated on every run, which is what causes duplicates).
# Entries written by earlier runs with random ids are not removed by this;
# delete the "emb" directory once if the store already contains duplicates.
db = Chroma.from_documents(
    docs,
    embedding=embeddings,
    ids=[f"facts-{index}" for index in range(len(docs))],
    persist_directory="emb"
)

## Retrieval from vector store

In [9]:
# Fetch the stored chunks most similar to the question; the number of hits is
# left at the library default.
results = db.similarity_search(query="What is an interesting fact about the English language?")

In [10]:
# Print the text of each hit, preceded by two empty lines as a visual separator.
for result in results:
    print("\n", result.page_content, sep="\n")



4. A snail can sleep for three years.
5. The longest word in the English language is 'pneumonoultramicroscopicsilicovolcanoconiosis.'
6. The elephant is the only mammal that can't jump.


1. "Dreamt" is the only English word that ends with the letters "mt."
2. An ostrich's eye is bigger than its brain.
3. Honey is the only natural food that is made without destroying any kind of life.


56. An apple, onion, and potato all have the same taste. The differences in flavor are caused by their smell.
57. Ancient Egyptians used a form of toothpaste over 5000 years ago.


76. The word "OK" stands for "oll korrect," a deliberate misspelling of "all correct."
77. The only letter that doesn’t appear on the Periodic Table is J.
78. Sheep don’t drink from running water.


## Custom document retriever class

In [None]:
class RedundantFilterRetriever(BaseRetriever):
    """Retriever that embeds the query and runs an MMR search on a Chroma store.

    The query string is turned into a vector with ``embeddings`` and that
    vector is handed to Chroma's max-marginal-relevance search, which is meant
    to drop hits that are redundant with one another.

    The attributes are declared as fields, so they are passed as keyword
    arguments: ``RedundantFilterRetriever(embeddings=..., chroma=...)``.
    ``lambda_mult`` is optional and defaults to the previously hard-coded 0.8.
    """

    # Embedding model used to vectorize the query string.
    embeddings: Embeddings
    # Chroma vector store that is searched.
    chroma: Chroma
    # Value forwarded as ``lambda_mult`` to the MMR search. It used to be
    # repeated as a literal in both methods; the default keeps that behavior.
    lambda_mult: float = 0.8

    def _get_relevant_documents(self, query: str):
        """Return the MMR search results for ``query`` (synchronous path)."""
        # calculate embeddings for the 'query' string
        query_vector = self.embeddings.embed_query(query)

        # feed the vector into max_marginal_relevance_search_by_vector
        return self.chroma.max_marginal_relevance_search_by_vector(
            embedding=query_vector,
            lambda_mult=self.lambda_mult,
        )

    async def _aget_relevant_documents(self, query: str):
        """Return the MMR search results for ``query`` (asynchronous path)."""
        # calculate embeddings for the 'query' string
        query_vector = await self.embeddings.aembed_query(query)

        # feed the vector into amax_marginal_relevance_search_by_vector
        return await self.chroma.amax_marginal_relevance_search_by_vector(
            embedding=query_vector,
            lambda_mult=self.lambda_mult,
        )

## Prompt with custom retrieval from vector store

In [None]:
# Chat model that writes the final answer.
chat = ChatOpenAI(model_name="gpt-5-nano")

# Re-open the collection persisted under "emb" instead of rebuilding it.
# The embedding model matches the one used when the store was created.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
db = Chroma(embedding_function=embeddings, persist_directory="emb")

# Custom retriever defined above, wired to that store and embedding model.
retriever = RedundantFilterRetriever(chroma=db, embeddings=embeddings)

In [23]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every document in ``docs``.

    Consecutive documents are separated by one empty line. An empty input
    yields an empty string.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

In [24]:
# Prompt template: the retrieved text goes into {context} and the user's
# question into {question}; the model is told to rely on the context only.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

In [25]:
# Create LCEL chain
# The incoming question is fanned out into a dict: "context" is produced by
# running the retriever and flattening its documents with format_docs, while
# "question" passes the input through unchanged. The dict fills the prompt,
# the chat model answers it, and StrOutputParser turns the reply into a string.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | chat
    | StrOutputParser()
)

In [26]:
# Run the whole pipeline on a single question and print the parsed answer.
result = chain.invoke("What is an interesting fact about the English language?")
print(result)

The longest word in the English language is 'pneumonoultramicroscopicsilicovolcanoconiosis.'


# Old way of doing this

In [None]:
import langchain
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from langchain.schema import BaseRetriever
from langchain_openai import OpenAIEmbeddings

from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file into the process environment
# (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()

## Vector store creation

In [None]:
# Embedding model for the legacy walkthrough.
# NOTE(review): no model is specified here, whereas the first half of the
# notebook uses model="text-embedding-3-small", and both halves persist to the
# same "emb" directory — confirm that vectors from different models are not
# being mixed in one collection.
embeddings = OpenAIEmbeddings()

In [None]:
# Splitter configuration: break the text on line breaks, aim for chunks of
# size 200, and do not repeat any text between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=0
)

# The encoding is pinned to UTF-8: without it TextLoader uses the platform
# default, which mis-decodes the file's non-ASCII punctuation on systems whose
# default encoding is not UTF-8.
loader = TextLoader("facts.txt", encoding="utf-8")
docs = loader.load_and_split(
    text_splitter=text_splitter
)


In [None]:
# Embed the chunks and persist them under "emb".
# NOTE(review): no ids are passed, so running this cell more than once appears
# to append another copy of every chunk to the same persisted collection —
# confirm, and clear the "emb" directory before rebuilding the store.
db = Chroma.from_documents(
    docs,
    embedding=embeddings,
    persist_directory="emb"
)

## Retrieval from vector store

In [None]:
# Fetch the stored chunks most similar to the question (hit count left at the
# library default), then print each hit's text preceded by two empty lines.
results = db.similarity_search(query="What is an interesting fact about the English language?")

for result in results:
    print("\n", result.page_content, sep="\n")

## Custom document retriever class

In [None]:
class RedundantFilterRetriever(BaseRetriever):
    """Legacy-style retriever: embed the query, then run an MMR search on Chroma.

    This variant overrides the public ``get_relevant_documents`` /
    ``aget_relevant_documents`` methods directly. Both attributes are declared
    as fields and are passed as keyword arguments on construction.
    """

    # Embedding model used to vectorize the query string.
    embeddings: Embeddings
    # Chroma vector store that is searched.
    chroma: Chroma

    def get_relevant_documents(self, query):
        """Return the MMR search results for ``query`` (synchronous path)."""
        # calculate embeddings for the 'query' string
        emb = self.embeddings.embed_query(query)

        # take embeddings and feed them into that
        # max_marginal_relevance_search_by_vector
        return self.chroma.max_marginal_relevance_search_by_vector(
            embedding=emb,
            lambda_mult=0.8
        )

    async def aget_relevant_documents(self, query):
        """Return the MMR search results for ``query`` (asynchronous path).

        This used to be a stub that accepted no query and always returned an
        empty list, so an async caller either failed on the unexpected argument
        or silently received no documents. It now mirrors the sync method.
        """
        # calculate embeddings for the 'query' string
        emb = await self.embeddings.aembed_query(query)

        # take embeddings and feed them into that
        # amax_marginal_relevance_search_by_vector
        return await self.chroma.amax_marginal_relevance_search_by_vector(
            embedding=emb,
            lambda_mult=0.8
        )

## Prompt with custom retrieval from vector store

In [None]:
# Set LangChain's module-level debug flag (presumably enables verbose tracing
# of the chain runs below — confirm against the installed version).
langchain.debug = True

# Chat model created with library defaults; no model name is given here.
chat = ChatOpenAI()
# NOTE(review): no embedding model is specified, whereas the first half of the
# notebook uses model="text-embedding-3-small" with the same "emb" directory —
# confirm the query embeddings match the model the store was built with.
embeddings = OpenAIEmbeddings()

In [None]:
# Re-open the collection persisted under "emb" instead of rebuilding it.
db = Chroma(
    persist_directory="emb",
    embedding_function=embeddings
)
# Custom retriever defined above, wired to that store and embedding model.
retriever = RedundantFilterRetriever(
    embeddings=embeddings,
    chroma=db
)

In [None]:
# Build a question-answering chain from the chat model and the custom retriever.
# chain_type="stuff" presumably places all retrieved documents into a single
# prompt — confirm against the RetrievalQA documentation.
chain = RetrievalQA.from_chain_type(
    llm=chat,
    retriever=retriever,
    chain_type="stuff"
)

In [None]:
# Ask the chain one question and print whatever it returns.
result = chain.run("What is an interesting fact about the English language?")

print(result)