### Document Loaders

In [1]:
from dotenv import load_dotenv, find_dotenv

# Locate the nearest .env file and load its variables into the process environment.
# Left as the cell's last expression so the boolean result is displayed.
load_dotenv(dotenv_path=find_dotenv())

True

In [2]:
from langchain_community.document_loaders import TextLoader

# Read the FAQ text file from the notebook's working directory into LangChain documents.
loader = TextLoader(file_path="./bella_vista.txt")
docs = loader.load()



In [3]:
# Show the loaded documents, then how many there are (one item per output line).
print(docs, len(docs), sep="\n")

[Document(metadata={'source': './bella_vista.txt'}, page_content="Q: What are the hours of operation for Bella Vista?\nA: Bella Vista is open from 11 a.m. to 11 p.m. from Monday to Saturday. On Sundays, we welcome guests from 12 p.m. to 10 p.m.\n\nQ: What type of cuisine does Bella Vista serve?\nA: Bella Vista offers a delightful blend of Mediterranean and contemporary American cuisine. We pride ourselves on using the freshest ingredients, many of which are sourced locally.\n\nQ: Do you offer vegetarian or vegan options at Bella Vista?\nA: Absolutely! Bella Vista boasts a diverse menu that includes a variety of vegetarian and vegan dishes. Our chefs are also happy to customize dishes based on dietary needs.\n\nQ: Is Bella Vista family-friendly?\nA: Yes, Bella Vista is a family-friendly establishment. We have a dedicated kids' menu and offer high chairs and booster seats for our younger guests.\n\nQ: Can I book private events at Bella Vista?\nA: Certainly! Bella Vista has a private dini

In [4]:
from langchain.schema import Document

# A Document is just text (page_content) plus a free-form metadata dict.
example_doc = Document(
    page_content="test",
    metadata={"important_info": "hi there"},
)
example_doc

Document(metadata={'important_info': 'hi there'}, page_content='test')

Texts are not loaded into the database as a whole, but split into pieces, so-called "chunks". You can define the chunk size and the overlap between consecutive chunks.

To create multiple documents (chunks), you can use a text splitter.

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of at most 100 characters,
# with 20 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
documents = text_splitter.split_documents(documents=docs)

In [6]:
# Print every chunk on its own line to inspect where the splitter cut the text.
for chunk in documents:
    print(chunk)

# Bare last expression: the notebook displays the total number of chunks.
len(documents)

page_content='Q: What are the hours of operation for Bella Vista?' metadata={'source': './bella_vista.txt'}
page_content='A: Bella Vista is open from 11 a.m. to 11 p.m. from Monday to Saturday. On Sundays, we welcome' metadata={'source': './bella_vista.txt'}
page_content='Sundays, we welcome guests from 12 p.m. to 10 p.m.' metadata={'source': './bella_vista.txt'}
page_content='Q: What type of cuisine does Bella Vista serve?' metadata={'source': './bella_vista.txt'}
page_content='A: Bella Vista offers a delightful blend of Mediterranean and contemporary American cuisine. We' metadata={'source': './bella_vista.txt'}
page_content='cuisine. We pride ourselves on using the freshest ingredients, many of which are sourced locally.' metadata={'source': './bella_vista.txt'}
page_content='Q: Do you offer vegetarian or vegan options at Bella Vista?' metadata={'source': './bella_vista.txt'}
page_content='A: Absolutely! Bella Vista boasts a diverse menu that includes a variety of vegetarian and veg

22

### Embeddings

Texts are not stored as plain text in the database, but as vector representations (embeddings).
An embedding is a numeric vector that captures the semantic meaning of a piece of text, so that texts with similar meaning end up close together in the vector space.

In [7]:
from langchain_openai import OpenAIEmbeddings

# Embedding client used for every vector below; it is constructed without arguments,
# so the library defaults apply.
# NOTE(review): presumably picks up the OpenAI API key from the environment populated
# by load_dotenv() -- confirm.
embeddings = OpenAIEmbeddings()

In [8]:
# Embed one sentence and show the raw vector followed by its dimensionality.
embedding1 = embeddings.embed_query(
    text="The solar system consists of the Sun and the objects that orbit it"
)
print(embedding1, len(embedding1), sep="\n")

[0.02499602735042572, 0.021089626476168633, -0.010334658436477184, -0.022412363439798355, -0.04037439450621605, 0.022622518241405487, -0.005476380232721567, -0.016169536858797073, -0.0012493378017097712, -0.009382782503962517, 0.00901810172945261, 0.005349669139832258, -0.014438852667808533, 0.0019856514409184456, -0.0060697575099766254, -0.015551435761153698, 0.011082561686635017, -0.004079470410943031, 0.0064715235494077206, -0.003609713166952133, 0.0022220751270651817, -0.007083444390445948, -0.011614128947257996, -0.0002339128259336576, 0.017220310866832733, -0.004258719738572836, 0.011657396331429482, -0.0318693183362484, 0.02291920594871044, -0.009339515119791031, 0.020310817286372185, -0.01064370945096016, -0.019791612401604652, -0.01552671194076538, 0.015600884333252907, 0.004135099705308676, 0.007312141824513674, 0.010112142190337181, -0.032091833651065826, 0.0043298015370965, 0.01142869796603918, 0.03389669209718704, -0.004515232052654028, -0.005207506008446217, -0.0127452546

In [9]:
# Three more embeddings for the similarity comparison: the same sentence as
# embedding1, a sentence on the same topic, and an unrelated sentence.
embedding2, embedding3, embedding4 = (
    embeddings.embed_query(text=sentence)
    for sentence in (
        "The solar system consists of the Sun and the objects that orbit it",
        "Planets, asteroids, and comets are part of our solar system.",
        "I love baking chocolate chip cookies on weekends.",
    )
)

In [10]:
import numpy as np

def cosine_similarity(A, B):
    """Return the cosine similarity of two vectors.

    Computed as dot(A, B) / (||A|| * ||B||).

    Args:
        A: First vector (list or array of numbers).
        B: Second vector, same length as ``A``.

    Returns:
        The cosine of the angle between ``A`` and ``B``. If either vector has
        zero length (norm 0), the angle is undefined and 0.0 is returned
        instead of dividing by zero.
    """
    vec_a = np.asarray(A, dtype=float)
    vec_b = np.asarray(B, dtype=float)
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # Avoid 0/0 -> nan (plus a RuntimeWarning) for all-zero vectors.
        return 0.0
    return np.dot(vec_a, vec_b) / denominator

In [11]:
# Compare identical sentences (1 vs 2), related sentences (1 vs 3)
# and unrelated sentences (3 vs 4).
sim_1_2, sim_1_3, sim_3_4 = (
    cosine_similarity(left, right)
    for left, right in (
        (embedding1, embedding2),
        (embedding1, embedding3),
        (embedding3, embedding4),
    )
)

print(sim_1_2, sim_1_3, sim_3_4)


0.9999948306841377 0.9003292492102601 0.6963035596148098


### Loading Vectors into VectorDB (FAISS)

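The vectors created by OpenAIEmbeddings can now be stored in the database together with their documents. The database can then be saved to disk with `save_local`, which writes a folder containing the FAISS index and a pickled (.pkl) document store.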

In [12]:
from langchain.vectorstores.faiss import FAISS

# Embed every chunk and build the FAISS index from the resulting vectors.
vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

# Persist the store to the "index" folder.
# newer FAISS versions can not be serialized with pickle
vectorstore.save_local(folder_path="index")

### Loading the database

Before using the database, it must of course be loaded again.

In [13]:
# Loading the store unpickles data from disk, and unpickling can execute arbitrary
# code. allow_dangerous_deserialization=True is therefore only acceptable for index
# folders you trust -- here, the "index" folder this notebook wrote itself.
# The result is bound to `vectorstore` so the cells below query the store that was
# loaded from disk rather than the one still held in memory.
vectorstore = FAISS.load_local("index", embeddings, allow_dangerous_deserialization=True)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x11ac16310>

In [14]:
# Wrap the vector store in a retriever. No search options are passed, so the
# library defaults apply (the recorded output of the next query shows 4 results).
retriever = vectorstore.as_retriever()

In [15]:
# Store the hits under their own name: `docs` holds the documents loaded from the
# text file, and overwriting it would make the splitting cell chunk these hits
# instead of the original file if it were re-run.
retrieved_docs = retriever.invoke(input="When are the opening hours??")
for doc in retrieved_docs:
    print(doc)


page_content='Q: What are the hours of operation for Bella Vista?' metadata={'source': './bella_vista.txt'}
page_content='Sundays, we welcome guests from 12 p.m. to 10 p.m.' metadata={'source': './bella_vista.txt'}
page_content='A: While walk-ins are always welcome, we recommend making a reservation, especially during weekends' metadata={'source': './bella_vista.txt'}
page_content='during weekends and holidays, to ensure a seamless dining experience.' metadata={'source': './bella_vista.txt'}


In [16]:
# Search options passed per call, directly to invoke().
# NOTE(review): the original author marked this call as "does not work!", yet the
# recorded output shows 3 results, i.e. k=3 was applied. Whether per-call kwargs
# reach the vector search may depend on the installed LangChain version -- confirm.
# Passing search_kwargs to as_retriever() is the explicit alternative.
# The hits get their own name so the loaded `docs` are not overwritten.
retrieved_docs = retriever.invoke(
    input="When are the opening hours?",
    filter={'source': './bella_vista.txt'},
    k=3,
)
for doc in retrieved_docs:
    print(doc)

page_content='Q: What are the hours of operation for Bella Vista?' metadata={'source': './bella_vista.txt'}
page_content='Sundays, we welcome guests from 12 p.m. to 10 p.m.' metadata={'source': './bella_vista.txt'}
page_content='A: While walk-ins are always welcome, we recommend making a reservation, especially during weekends' metadata={'source': './bella_vista.txt'}


In [17]:
# Configure the search options on the retriever itself: only chunks whose metadata
# matches the filter are considered, and a single result (k=1) is returned.
retriever = vectorstore.as_retriever(
    search_kwargs={"filter": {'source': './bella_vista.txt'}, "k": 1}
)

# The hits get their own name so the `docs` loaded from the text file stay intact.
retrieved_docs = retriever.invoke(input="When are the opening hours??")
for doc in retrieved_docs:
    print(doc)

page_content='Q: What are the hours of operation for Bella Vista?' metadata={'source': './bella_vista.txt'}


### Now we have to pass the documents to an LLM.

We create a prompt that contains the question and a context. The context is the output of the retriever (the document store).
LangChain provides ready-made chains for this: `RetrievalQA` (the old, deprecated way) and `create_retrieval_chain` (the new way).

### Old way (deprecated)

In [18]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

from langchain.prompts import PromptTemplate

# Chat model that writes the final answer.
llm = ChatOpenAI(model="gpt-4o-mini")

# Prompt with two placeholders: {context} for the retrieved text and
# {question} for the user's query.
prompt_template = """You are a helpful assistant for our restaurant.

{context}

Question: {question}
Answer here:"""
PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
chain_type_kwargs = dict(prompt=PROMPT)

# "stuff" chain type with the custom prompt, fed by the retriever defined earlier.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
)

result = qa.invoke(input="When are the opening hours on sunday??")
print(result)

{'query': 'When are the opening hours on sunday??', 'result': 'The opening hours on Sunday are from 12 p.m. to 10 p.m.'}


### New Way

In [19]:
from langchain_openai import ChatOpenAI
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import PromptTemplate

# Chat model that writes the final answer.
llm = ChatOpenAI(model="gpt-4o-mini")

# Same prompt as in the deprecated variant, except that the question placeholder
# is named {input}, matching the key passed to invoke() below.
prompt_template = """You are a helpful assistant for our restaurant.

{context}

Question: {input}
Answer here:"""
PROMPT = PromptTemplate(input_variables=["context", "input"], template=prompt_template)

# First build the chain that combines documents with the prompt and the model,
# then put the retriever in front of it.
combine_docs_chain = create_stuff_documents_chain(llm=llm, prompt=PROMPT)
qa = create_retrieval_chain(combine_docs_chain=combine_docs_chain, retriever=retriever)

# The result dict contains the input, the retrieved context and the answer.
result = qa.invoke({"input": "When are the opening hours on sunday??"})
print(result)

{'input': 'When are the opening hours on sunday??', 'context': [Document(id='fed2aceb-bfe3-40e4-b0af-287e0e7bc508', metadata={'source': './bella_vista.txt'}, page_content='Sundays, we welcome guests from 12 p.m. to 10 p.m.')], 'answer': 'The opening hours on Sunday are from 12 p.m. to 10 p.m.'}
