In [9]:
import os

from haystack import Pipeline, Document
from haystack.utils import Secret
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack_integrations.components.generators.ollama import OllamaGenerator

In [10]:
# Build the corpus: one Document per distinct, non-empty line of input.txt.
#
# - Blank lines are skipped so no empty-content Document ends up in the store.
# - dict.fromkeys() removes duplicates while keeping first-seen order; set()
#   ordering of strings varies between interpreter runs (hash randomization),
#   which made the document order non-reproducible.
with open("input.txt", "r", encoding="utf-8") as f:
    # NOTE: despite its name, this list holds the de-duplicated lines
    # (the name is kept for compatibility with the rest of the notebook).
    train_data_hava_dups = list(
        dict.fromkeys(
            text for text in (raw_line.strip() for raw_line in f) if text
        )
    )

train_data = [Document(content=text) for text in train_data_hava_dups]

In [11]:
# Create a fresh in-memory store and index the prepared documents.
# The call is left as the cell's last expression so its return value
# (shown as the cell output) stays visible.
document_store = InMemoryDocumentStore()
document_store.write_documents(documents=train_data)

1

In [12]:
# Take the API key from the first line of api_key.txt (surrounding whitespace
# removed) and publish it as GROQ_API_KEY, the environment variable the
# generator's Secret is configured to read.
with open("api_key.txt", mode="r") as key_file:
    api_key = key_file.readline().strip()

os.environ["GROQ_API_KEY"] = api_key

In [13]:
# Prompt template: emits the content of every retrieved document, followed by
# the user's question and an "Answer:" cue for the model.
prompt_template = """
According to the contents of this website:
{% for document in documents %}
  {{document.content}}
{% endfor %}
Answer the given question: {{query}}
Answer:
"""

# Stand-alone component instances. The pipeline cell below constructs its own
# retriever and prompt builder, so these two are not the ones wired into `pipe`.
retriever = InMemoryBM25Retriever(document_store=document_store)
prompt_builder = PromptBuilder(template=prompt_template)

# OpenAI-style generator pointed at Groq's endpoint; the key is resolved from
# the GROQ_API_KEY environment variable. Replies are capped at 1024 tokens.
llm = OpenAIGenerator(
    model="llama3-70b-8192",
    api_base_url="https://api.groq.com/openai/v1",
    api_key=Secret.from_env_var("GROQ_API_KEY"),
    generation_kwargs=dict(max_tokens=1024),
)


In [14]:

# Assemble the retrieval-augmented pipeline: retriever -> prompt builder -> LLM.
pipe = Pipeline()

# Register the three stages. The retriever and prompt builder are created
# here; the generator is the `llm` instance configured in the previous cell.
pipe.add_component(
    name="retriever",
    instance=InMemoryBM25Retriever(document_store=document_store),
)
pipe.add_component(
    name="prompt_builder",
    instance=PromptBuilder(template=prompt_template),
)
pipe.add_component(name="llm", instance=llm)

# Wire the stages with explicit socket names: retrieved documents feed the
# prompt template, and the rendered prompt feeds the generator.
pipe.connect("retriever.documents", "prompt_builder.documents")
pipe.connect("prompt_builder.prompt", "llm.prompt")

<haystack.core.pipeline.pipeline.Pipeline object at 0x000001BC0C625EA0>
🚅 Components
  - retriever: InMemoryBM25Retriever
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
🛤️ Connections
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [22]:
# Earlier experiment queries, kept disabled for reference (some ask the model
# to answer in Chinese):
# query = "Based on the documents, are the author named 陳鼎 of 滇黔土司㛰禮記 and the author of 滇黔紀游 the same person? Please provide your reasons."
# query = "用中文回答。根據這些文件，特別是他們生平，譬如 courtesy name, style name、著述，哪裡人的資訊告訴我，滇黔土司㛰禮記的作者陳鼎和滇黔紀遊的作者是同一個人嗎？請提供你的理由。"
# query = "用中文回答。根據這些文件，吴阐思是哪里人？并且告诉我资料来源"
# query = "Tell me the informaiton of 段生珖遠. Tell me the source you found in the document"

# Active query. The same text is sent to the retriever (to select documents)
# and to the prompt builder (to fill the {{query}} slot of the template).
query = "Base on the documents, tell me the father of 長正宸. Tell me the source you found in the document"

res = pipe.run(
    data={
        "retriever": dict(query=query),
        "prompt_builder": dict(query=query),
    }
)

# Print the first reply produced by the generator.
print(res["llm"]["replies"][0])

According to the document, the father of 長正宸 (ZhángZhèngChén) is 段生珖遠 (Duàn Shēng Jiǎo Yuǎn).

The source I found in the document is:

「段生珖遠，字公亨，黔陽子弟鄕人，余初涖黔時所首拔士也。行篤而文優，生二子，長正宸，次正栻。」

Which means: "Duàn Shēng Jiǎo Yuǎn, also known as Gōng Hēng, was a native of Qián Yáng, and was one of the first scholars I recruited when I first governed Qián. He was upright and talented, and had two sons, the elder being ZhángZhèngChén and the younger being ZhángZhèngYí."
