In [1]:
import os

from haystack import Pipeline, Document
from haystack.utils import Secret
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack_integrations.components.generators.ollama import OllamaGenerator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the corpus: every unique, non-blank line of the input file becomes one Document.
# Alternative corpus file used previously: "input_52_ch_en.txt"
with open("input.txt", "r", encoding="utf-8") as f:
    stripped_lines = [line.strip() for line in f]

# De-duplicate with dict.fromkeys instead of set(): a set of strings iterates in an
# order that changes between interpreter runs (hash randomisation), which made the
# document order non-reproducible. dict.fromkeys keeps first-seen file order.
# Blank lines are skipped so no empty-content Document is created.
# NOTE: despite its name, this list holds the de-duplicated lines (as it did after
# the original set() step); the name is kept in case other cells reference it.
train_data_hava_dups = [line for line in dict.fromkeys(stripped_lines) if line]

train_data = [Document(content=line) for line in train_data_hava_dups]

# train_data= [
#         Document(content="The meaning of 一洗万古 is that【清】詹事府司经局洗马之谑称，意指官至此，其后之升阶无望。清何刚德《话梦集》卷上：“三铨选格失调停，鲇竹功免滞典经。”自注：“‘一洗万古’，京曹谑语也。翰林转到詹事府司经局洗马，升阶便滞。人因‘一洗万古凡马空’之句，嘲之曰‘一洗万古’，亦以清朝不立东宫，故官制不甚介意也。”"),
# ]

In [3]:
# Create a fresh in-memory store and index the prepared documents.
# The write call is left as the cell's last expression so its return value is displayed.
document_store = InMemoryDocumentStore()
document_store.write_documents(documents=train_data)

62

In [4]:
# Jinja template: lists the content of every entry in `documents`, then appends `question`.
prompt_template = """
Given these documents, answer the question.
Documents:
{% for doc in documents %}
    {{ doc.content }}
{% endfor %}
Question: {{question}}
Answer:
"""

# Generator talking to an Ollama endpoint on localhost.
llm = OllamaGenerator(
    url="http://localhost:11434/api/generate",
    model="phi3:medium",
    generation_kwargs=dict(num_predict=1000, temperature=0.1),
)

# Renders the template above into the prompt string.
prompt_builder = PromptBuilder(template=prompt_template)

# BM25 retriever reading from the in-memory document store.
retriever = InMemoryBM25Retriever(document_store=document_store)

# Assemble the RAG pipeline: retriever -> prompt_builder -> llm.
rag_pipeline = Pipeline()

rag_pipeline.add_component(name="retriever", instance=retriever)
rag_pipeline.add_component(name="prompt_builder", instance=prompt_builder)
rag_pipeline.add_component(name="llm", instance=llm)

# Wire the components with explicit socket names on both ends.
# The final connect call stays a bare expression so the cell displays the pipeline.
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder.prompt", "llm.prompt")

<haystack.core.pipeline.pipeline.Pipeline object at 0x00000240F9DC9DE0>
🚅 Components
  - retriever: InMemoryBM25Retriever
  - prompt_builder: PromptBuilder
  - llm: OllamaGenerator
🛤️ Connections
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [7]:
# Alternate query kept for reference:
# question = "Based on the documents, please reply to me in Chinese. What is the meaning of 七佐?"
question = "Based on the documents, are the author named 陳鼎 of 滇黔土司㛰禮記 and the author of 滇黔紀游 the same person? Please provide your reasons."

# The same question text goes to the retriever (as its query) and to the prompt builder.
results = rag_pipeline.run(
    data=dict(
        retriever=dict(query=question),
        prompt_builder=dict(question=question),
    )
)

# Show the list of replies produced by the "llm" component.
print(results["llm"]["replies"])


[' 根据文档，身言书判是指唐朝选官时的考核内容。即通过礼部试后，进士及第者不能直接获得官位，还需要再通过吏部选官一阶段。这个阶段包括书判、身言两项考查。书判是指书法和文章的评价；身言则是对候选人的外表和言语进行评估，要求其体态丰伟，言辞正确。如果四者都合格，才能通过注册、推举等程序，最终由吏部上报给尚书仆射，再经门下省反复审核。']
