In [1]:
import os

from haystack import Pipeline, Document
from haystack.utils import Secret
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever 
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.prompt_builder import PromptBuilder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the corpus: every non-blank line of input.txt becomes one Document.
with open("input.txt", "r", encoding="utf-8") as f:
    stripped_lines = [line.strip() for line in f]

# De-duplicate with dict.fromkeys rather than set(): it keeps first-seen file
# order, whereas set() ordering of strings changes from one kernel to the next
# and made the indexed corpus order non-reproducible.
# Blank lines are dropped so no empty-content Document is embedded.
# NOTE: the name is kept for compatibility; after this cell the list holds the
# de-duplicated lines.
train_data_hava_dups = list(dict.fromkeys(text for text in stripped_lines if text))

train_data = [Document(content=text) for text in train_data_hava_dups]

In [3]:
# Index the corpus: embed every Document from train_data, then write the
# embedded copies into an in-memory store configured for cosine similarity.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

document_embedder = SentenceTransformersDocumentEmbedder()
document_embedder.warm_up()  # called once before run()

embedding_result = document_embedder.run(documents=train_data)
documents_with_embeddings = embedding_result["documents"]

# Kept as the cell's last expression so the notebook displays its return value.
document_store.write_documents(documents=documents_with_embeddings)

Batches: 100%|██████████| 2/2 [00:24<00:00, 12.33s/it]


62

In [4]:
# Read the API key from the first line of api_key.txt and export it as
# GROQ_API_KEY for the rest of the notebook.
# "utf-8-sig" transparently drops a UTF-8 byte-order mark if the file was saved
# with one; otherwise the BOM would end up inside the key.
with open("api_key.txt", "r", encoding="utf-8-sig") as f:
    api_key = f.readline().strip()

# Fail here with a clear message instead of exporting an empty key and getting
# an authentication error later, at request time.
if not api_key:
    raise ValueError(
        "api_key.txt is empty or its first line is blank; "
        "put the API key on the first line."
    )

os.environ["GROQ_API_KEY"] = api_key

In [5]:
# Building blocks for retrieval-augmented generation: retriever, prompt, LLM.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

# Jinja template: lists the content of every retrieved document, then asks the
# question passed in as `query`.
prompt_template = """
According to the contents of this website:
{% for document in documents %}
  {{document.content}}
{% endfor %}
Answer the given question: {{query}}
Answer:
"""
prompt_builder = PromptBuilder(template=prompt_template)

# OpenAIGenerator pointed at the Groq endpoint; the key is resolved from the
# GROQ_API_KEY environment variable rather than passed as a plain string.
# NOTE(review): confirm this model id is still offered by the provider.
llm = OpenAIGenerator(
    model="llama3-70b-8192",
    api_base_url="https://api.groq.com/openai/v1",
    api_key=Secret.from_env_var("GROQ_API_KEY"),
    generation_kwargs={"max_tokens": 1024},
)


In [6]:

# Assemble the pipeline: query text -> embedding -> retrieval -> prompt -> LLM.
pipe = Pipeline()

# Components, registered under the names the connections below refer to.
rag_components = {
    "text_embedder": SentenceTransformersTextEmbedder(),
    "retriever": InMemoryEmbeddingRetriever(document_store=document_store),
    "prompt_builder": PromptBuilder(template=prompt_template),
    "llm": llm,
}
for component_name, component in rag_components.items():
    pipe.add_component(component_name, component)

# (sender, receiver) pairs, wired in this order.
rag_connections = [
    ("text_embedder.embedding", "retriever.query_embedding"),
    ("retriever", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
]
for sender, receiver in rag_connections:
    pipe.connect(sender, receiver)

# Bare expression so the notebook renders the pipeline summary.
pipe

<haystack.core.pipeline.pipeline.Pipeline object at 0x0000017E155FB9A0>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [7]:
# query = "Based on the documents, are the author named 陳鼎 of 滇黔土司㛰禮記 and the author of 滇黔紀游 the same person? Please provide your reasons."
# query = "用中文回答。根據這些文件，特別是他們生平，譬如 courtesy name, style name、著述，哪裡人的資訊告訴我，滇黔土司㛰禮記的作者陳鼎和滇黔紀遊的作者是同一個人嗎？請提供你的理由。"
# query = "用中文回答。根據這些文件，吴阐思是哪里人？并且告诉我资料来源"
# query = "Tell me the information of 段生珖遠. Tell me the source you found in the document"
# Active question. The same text is sent to the prompt builder (to be asked)
# and to the text embedder (to drive retrieval).
query = "Base on the documents, tell me the father of 長正宸. Tell me the source you found in the document"

pipeline_inputs = {
    "prompt_builder": {"query": query},
    "text_embedder": {"text": query},
}
res = pipe.run(pipeline_inputs)

# Show only the first reply returned by the LLM component.
first_reply = res["llm"]["replies"][0]
print(first_reply)

Batches: 100%|██████████| 1/1 [00:00<00:00, 10.47it/s]


According to the document, the father of 長正宸 is 中憲公. The source I found in the document is:

"劉崑字西來吉安人進士 謹案庭聞錄崑 爲順治己亥進士 康熙初官雲南府同知 謹案庭聞錄崑於 康熙十一年山束 鹿縣知縣 擢是職 吳三桂叛崑執節不屈安置騰越後大兵 定滇擢常德府知府 騰越 州志 謹案謝聖綸滇黔志畧引南中雜說凡四條但云撰 於 國初不言崑作說中有康熙十二年予嘗入逆藩便坐 　語蓋作於三桂旣叛以後也惟劉健庭聞錄自序稱 其父先中憲公居永昌日曾著吳三桂傳及滇變記 二種己未歲封稾於壁中人寶臺山避兵踰年返求 故居滿目蓬蒿"

This passage mentions that the father of 劉健 (also known as 長正宸) is 中憲公.
