In [1]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

# --- Configuration -----------------------------------------------------------
# Tunable settings are named here so they are not buried in the calls below.
DATA_DIR = "./data"  # folder scanned for source text files
DATA_GLOB = "**/*.txt"  # glob pattern handed to the directory loader
CHUNK_SIZE = 120  # maximum chunk length, measured with len()
CHUNK_OVERLAP = 20  # length shared between neighbouring chunks
CHAT_MODEL_NAME = "gpt-4o-mini"

# Load environment variables from <cwd>/app/.env
# (presumably the API credentials used by the OpenAI clients below).
app_dir = os.path.join(os.getcwd(), "app")
load_dotenv(os.path.join(app_dir, ".env"))

# --- Load ----------------------------------------------------------------------
loader = DirectoryLoader(DATA_DIR, glob=DATA_GLOB)
docs = loader.load()
if not docs:
    # Fail fast with a clear message: with nothing loaded there is nothing to
    # split, embed, or retrieve in the rest of the notebook.
    raise ValueError(
        f"No documents matched {DATA_GLOB!r} under {DATA_DIR!r}; nothing to index."
    )

# --- Split ---------------------------------------------------------------------
# Small chunks keep each retrieved passage short.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    is_separator_regex=False,
)
chunks = text_splitter.split_documents(docs)

# --- Embed and index -----------------------------------------------------------
embedding_function = OpenAIEmbeddings()
model = ChatOpenAI(model=CHAT_MODEL_NAME)

# Build the vector store from the chunks and expose it as a retriever.
db = Chroma.from_documents(chunks, embedding_function)
retriever = db.as_retriever()

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


In [2]:
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
import re

# Example user question fed to the multi-query chain.
query = "Who owns the restaurant?"


# Multi-query prompt: asks the model to rephrase the user's question several
# ways, each wrapped in << >> so split_and_clean_text can separate them.
# NOTE(review): the text asks for five versions but the example shows only two
# delimited lines; the indentation of the continuation lines is part of the
# string that is sent to the model.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative question like this:
    <<question1>>
    <<question2>>
    Only provide the query, no numbering.
    Original question: {question}""",
)


def split_and_clean_text(input_text):
    """Split ``<<...>>``-delimited model output into a list of clean strings.

    The text is cut at every ``<<`` or ``>>`` marker. Fragments that are empty
    or whitespace-only (such as the newline between ``>>`` and the next ``<<``)
    are dropped, and every remaining fragment is returned with its surrounding
    whitespace removed. Text that appears outside the markers is kept as its
    own fragment.

    Args:
        input_text: Raw string produced by the language model.

    Returns:
        list[str]: The non-empty fragments, in their original order.
    """
    # Strip each fragment once and reuse it for both the emptiness check and the
    # result, so padded output such as "<< question >>" yields "question".
    stripped_fragments = (part.strip() for part in re.split(r"<<|>>", input_text))
    return [fragment for fragment in stripped_fragments if fragment]

In [3]:
# Multi-query chain: prompt -> chat model -> plain string -> list of rephrased
# questions. `model` is the ChatOpenAI instance created in the setup cell; it is
# reused here instead of constructing a second client with identical arguments.
multiquery_chain = (
    QUERY_PROMPT | model | StrOutputParser() | RunnableLambda(split_and_clean_text)
)

In [4]:
# Ask the model for alternative phrasings of `query`; the chain's last step
# turns the reply into a list of strings.
list_of_questions = multiquery_chain.invoke(query)

In [5]:
# Display the generated question variants.
list_of_questions

['What is the ownership information for the restaurant?',
 'Can you tell me who the proprietor of the restaurant is?',
 'Who is the owner of the restaurant establishment?',
 'Who holds the ownership rights for the restaurant?',
 "What details can you provide about the restaurant's ownership?"]

In [6]:
# Run one retrieval per generated question; the result is a list of result lists.
# NOTE: this rebinds `docs`, which until now held the documents loaded from ./data.
docs = [retriever.invoke(q) for q in list_of_questions]

In [7]:
# Inspect the raw per-question results; the same chunk can appear under several questions.
docs

[[Document(metadata={'source': 'data/founder.txt'}, page_content='Creating Chef Amico’s Restaurant'),
  Document(metadata={'source': 'data/restaurant.txt'}, page_content='One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico.'),
  Document(metadata={'source': 'data/founder.txt'}, page_content='Philosophy of Hospitality'),
  Document(metadata={'source': 'data/founder.txt'}, page_content='the restaurant quickly gained fame for its authentic flavors and Amico’s innovative twists on traditional recipes.')],
 [Document(metadata={'source': 'data/founder.txt'}, page_content='Creating Chef Amico’s Restaurant'),
  Document(metadata={'source': 'data/restaurant.txt'}, page_content='One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico.'),
  Document(metadata={'source': 'data/restaurant.txt'}, page_content="into Chef Amico. Her mission was to uncover the secret behind t

In [8]:
def flatten_and_unique_documents(documents):
    """Merge nested retrieval results into one list without repeated text.

    Args:
        documents: Iterable of iterables of objects exposing ``page_content``
            (one inner iterable per query).

    Returns:
        list: The first object seen for each distinct ``page_content`` value,
        in encounter order (outer order first, then inner order).
    """
    seen_texts = set()
    deduped = []
    for batch in documents:
        for candidate in batch:
            text = candidate.page_content
            if text in seen_texts:
                # Same text already kept from an earlier result; skip this copy.
                continue
            seen_texts.add(text)
            deduped.append(candidate)
    return deduped

In [9]:
# Merge the per-question results into a single list with repeated chunks removed.
flatten_and_unique_documents(documents=docs)

[Document(metadata={'source': 'data/founder.txt'}, page_content='Creating Chef Amico’s Restaurant'),
 Document(metadata={'source': 'data/restaurant.txt'}, page_content='One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico.'),
 Document(metadata={'source': 'data/founder.txt'}, page_content='Philosophy of Hospitality'),
 Document(metadata={'source': 'data/founder.txt'}, page_content='the restaurant quickly gained fame for its authentic flavors and Amico’s innovative twists on traditional recipes.'),
 Document(metadata={'source': 'data/restaurant.txt'}, page_content="into Chef Amico. Her mission was to uncover the secret behind the restaurant's growing fame. She was greeted by Amico"),
 Document(metadata={'source': 'data/founder.txt'}, page_content='and relish life’s simple pleasures. His restaurant was a haven where strangers became friends over plates of arancini'),
 Document(metadata={'source': 'data/founder.txt'}, page_

In [10]:
# HyDE-style prompt: instead of rephrasing the question, ask the model for five
# hypothetical answers, each wrapped in << >> so split_and_clean_text can
# separate them. The answers are used later as the search text for retrieval.
HYDE_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five hypothetical answers to the user's query. These answers should offer diverse perspectives or interpretations, aiding in a comprehensive understanding of the query. Present the hypothetical answers as follows:

    <<Answer considering a specific perspective>>
    <<Answer from a different angle>>
    <<Answer exploring an alternative possibility>>
    <<Answer providing a contrasting viewpoint>>
    <<Answer that includes a unique insight>>

    Note: Present only the hypothetical answers, without numbering (or "-", "1.", "*") and so on, to provide a range of potential interpretations or solutions related to the query.
    Original question: {question}""",
)

In [11]:
# HyDE chain: same pipeline shape as the multi-query chain (prompt -> model ->
# string -> << >> splitter), but driven by HYDE_PROMPT.
hyde_chain = (
    HYDE_PROMPT | model | StrOutputParser() | RunnableLambda(split_and_clean_text)
)

In [12]:
# Generate hypothetical answers and display them.
# NOTE(review): despite its name, `list_of_questions` now holds hypothetical
# answers, and the question is a hard-coded string rather than `query` — confirm
# that this difference from the multi-query run is intentional.
list_of_questions = hyde_chain.invoke("Who is the owner of the restaurant")
list_of_questions

['The owner of the restaurant is likely an individual entrepreneur who took a personal interest in the culinary arts and decided to bring their vision to life, often investing their own savings to create a unique dining experience in the community.',
 'The restaurant could be owned by a family or a partnership, where multiple members contribute their skills and resources to manage and operate the business, fostering a sense of tradition and shared responsibility.',
 'An alternative possibility is that the restaurant is part of a larger franchise or chain, in which case the owner may be a corporate entity that oversees multiple locations, prioritizing brand consistency and operational efficiency over local flavor.',
 'From a contrasting viewpoint, the restaurant might be cooperative-owned, where the staff and patrons have a stake in the establishment, promoting a collaborative atmosphere and shared decision-making in its operations and community engagement.',
 'A unique insight might su

In [13]:
# Retrieve once per hypothetical answer (each answer is the search text), then
# merge the results into a single list with repeated chunks removed.
docs = [retriever.invoke(q) for q in list_of_questions]
flatten_and_unique_documents(documents=docs)

[Document(metadata={'source': 'data/founder.txt'}, page_content='Creating Chef Amico’s Restaurant'),
 Document(metadata={'source': 'data/founder.txt'}, page_content='the restaurant quickly gained fame for its authentic flavors and Amico’s innovative twists on traditional recipes.'),
 Document(metadata={'source': 'data/founder.txt'}, page_content='young chefs, shares his knowledge at culinary workshops, and supports local farmers and producers.'),
 Document(metadata={'source': 'data/restaurant.txt'}, page_content='One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico.'),
 Document(metadata={'source': 'data/restaurant.txt'}, page_content='the food. It was about the stories, the traditions, and the heart poured into every dish.'),
 Document(metadata={'source': 'data/founder.txt'}, page_content='and traditions that would later influence his unique culinary style.'),
 Document(metadata={'source': 'data/founder.txt'}, page_cont