In [1]:
import os

from haystack import Pipeline, Document
from haystack.utils import Secret
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack_integrations.components.generators.ollama import OllamaGenerator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the corpus: every non-empty, unique line of the input file becomes one
# Document. Lines are stripped of surrounding whitespace before comparison.
# with open("input_52_ch_en.txt", "r", encoding="utf-8") as f:
with open("input.txt", "r", encoding="utf-8") as f:
    stripped_lines = [line.strip() for line in f]

# De-duplicate with dict.fromkeys rather than set(): a set's iteration order
# for strings changes between kernel restarts (hash randomization), which made
# the document order non-reproducible. dict.fromkeys keeps first-seen order.
# Blank lines are dropped so no empty Document ends up in the index.
# The (historical) name is kept; after this statement it holds the unique lines.
train_data_hava_dups = list(dict.fromkeys(line for line in stripped_lines if line))
train_data = [Document(content=line) for line in train_data_hava_dups]

# Example of the expected document format:
# train_data= [
#         Document(content="The meaning of 一洗万古 is that【清】詹事府司经局洗马之谑称，意指官至此，其后之升阶无望。清何刚德《话梦集》卷上：“三铨选格失调停，鲇竹功免滞典经。”自注：“‘一洗万古’，京曹谑语也。翰林转到詹事府司经局洗马，升阶便滞。人因‘一洗万古凡马空’之句，嘲之曰‘一洗万古’，亦以清朝不立东宫，故官制不甚介意也。”"),
# ]

In [3]:
# Create a new in-memory store and index the documents prepared above.
# The value returned by write_documents is the cell's displayed output.
# NOTE(review): BM25 retrieval quality depends on how the store tokenizes text;
# confirm the default tokenization is adequate for unsegmented Chinese content.
document_store = InMemoryDocumentStore()
document_store.write_documents(documents=train_data)

1

In [4]:
# Jinja template rendered by PromptBuilder: it lists the content of every
# retrieved document and then appends the user's question.
prompt_template = """
Given these documents, answer the question.
Documents:
{% for doc in documents %}
    {{ doc.content }}
{% endfor %}
Question: {{question}}
Answer:
"""

rag_pipeline = Pipeline()

# Retrieval stage: BM25 search over the in-memory document store.
retriever = InMemoryBM25Retriever(document_store=document_store)
rag_pipeline.add_component("retriever", retriever)

# Prompt stage: fills the template with the retrieved documents and question.
prompt_builder = PromptBuilder(template=prompt_template)
rag_pipeline.add_component("prompt_builder", prompt_builder)

# Generation stage: the phi3:medium model reached through the Ollama endpoint
# on localhost.
llm = OllamaGenerator(
    model="phi3:medium",
    url="http://localhost:11434/api/generate",
    generation_kwargs=dict(num_predict=1000, temperature=0.1),
)
rag_pipeline.add_component("llm", llm)

# Wire the stages: retriever -> prompt_builder.documents, prompt_builder -> llm.
# The final connect call stays last so the pipeline summary is displayed.
rag_pipeline.connect("retriever", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")

<haystack.core.pipeline.pipeline.Pipeline object at 0x0000022AE80E54B0>
🚅 Components
  - retriever: InMemoryBM25Retriever
  - prompt_builder: PromptBuilder
  - llm: OllamaGenerator
🛤️ Connections
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [5]:
# Alternative queries kept for reference:
# question = "Based on the documents, please reply to me in Chinese. What is the meaning of 七佐?"
# question = "用中文回答。根據這些文件，阐思是哪里人？并且告诉我资料来源"
# question = "Tell me where 阐思 was from, and tell me the source"

# The same question text is used twice: as the BM25 search query and as the
# question rendered into the prompt. (Typo "informaiton" fixed, since the
# string is sent verbatim to both the retriever and the LLM.)
question = "Tell me the information of 段生珖遠. Tell me the source you found in the document"
results = rag_pipeline.run(
    {
        "retriever": {"query": question},
        "prompt_builder": {"question": question},
    }
)

# Print the first generated reply as plain text.
print(results["llm"]["replies"][0])


 段生珖遠, also known as公�n�，is a character from the documents provided. He is described as an individual with strong moral values and good education. He was born in the Chenxi area of Hunan province and later moved to Guizhou province's Tongren County (now part of Renhuai City). 

段生珖遠 has two sons, 正宸 and 正栻. During a time when their hometown was occupied by rebels, the elder son, 正宸, sacrificed his life to save his father from being killed by the rebels. The younger son, 正栻, also showed great courage and selflessness in protecting his family.

The source of this information is found in a document titled "張扶翼紀異錄" (Zhang Fuying's Record of Strange Events).
