In [None]:
# 載入數據集
from datasets import load_dataset
from haystack import Document

# Load the PubMedQA instruction dataset (train split) and keep only the first 1000 rows.
dataset = load_dataset("vblagoje/PubMedQA_instruction", split="train").select(range(1000))

# Wrap every row's "context" passage in a Haystack Document so it can be indexed.
all_documents = [Document(content=row["context"]) for row in dataset]
# Questions, taken from the "instruction" field (same row order as the documents).
all_questions = [row["instruction"] for row in dataset]
# Ground-truth answers, taken from the "response" field (same row order).
all_ground_truth_answers = [row["response"] for row in dataset]

In [None]:
from typing import List
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

In [None]:
# Create the (initially empty) indexing pipeline; its components are added and connected in the following cells.
indexing = Pipeline()

In [None]:
# Document embedder backed by the all-MiniLM-L6-v2 sentence-transformers model.
document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

# In-memory document store that will hold the embedded documents.
document_store = InMemoryDocumentStore()

# Writer that persists documents into the store; duplicates are skipped rather than overwritten.
document_writer = DocumentWriter(policy=DuplicatePolicy.SKIP, document_store=document_store)

In [None]:
# Register both components with the indexing pipeline under the names used when connecting them.
indexing.add_component(name="document_embedder", instance=document_embedder)
indexing.add_component(name="document_writer", instance=document_writer)

In [None]:
# Wire the pipeline: the embedder's "documents" output feeds the writer's "documents" input.
indexing.connect("document_embedder.documents", "document_writer.documents")

In [None]:
# Run the indexing pipeline: all_documents is passed to the embedder's "documents" input.
indexing.run(
    {
        "document_embedder": {
            "documents": all_documents,
        },
    }
)

In [None]:
# Save a diagram of the indexing pipeline to 'ex15-1.png' (path is relative to the working directory).
# NOTE(review): depending on the Haystack version, draw() may need network access to render — confirm.
indexing.draw('ex15-1.png')

In [None]:
import os
from getpass import getpass
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Ask for the OpenAI API key interactively only when it is missing or empty.
# Do not copy os.getenv(...) back into os.environ: getenv returns None for an
# unset variable, and os.environ only accepts str values, so that assignment
# raises TypeError before the getpass fallback can ever run.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

In [None]:
from haystack.components.builders import AnswerBuilder, PromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

In [None]:
# Prompt template for answer generation: it loops over `documents` to print each
# document's content as context, then inserts `question`. The prompt text itself
# is written in Chinese and is passed to the prompt builder unchanged.
template = """
        您必須僅根據給定的上下文資訊回答以下問題。

        上下文:
        {% for document in documents %}
            {{ document.content }}
        {% endfor %}

        問題: {{question}}
        答案:
        """

In [None]:
# Create the (initially empty) RAG pipeline; its components are added and connected in the following cells.
rag_pipeline = Pipeline()

In [None]:
# Query embedder. It must be registered as "query_embedder" because both the
# connect cell and the run cell address the component by that name; registering
# it under any other name makes those calls fail on a fresh kernel.
#
# The model is the same one used by the document embedder at indexing time
# (all-MiniLM-L6-v2), so query and document vectors are comparable. To try a
# multilingual model such as
# "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", change it in
# BOTH embedders and re-run the indexing pipeline.
rag_pipeline.add_component(
    "query_embedder",
    SentenceTransformersTextEmbedder(
        model="sentence-transformers/all-MiniLM-L6-v2"
    )
)

# Retriever over the in-memory store populated by the indexing pipeline (top 3 documents).
rag_pipeline.add_component(
    "retriever",
    InMemoryEmbeddingRetriever(document_store, top_k=3)
)
# Renders the prompt template with the retrieved documents and the question.
rag_pipeline.add_component(
    "prompt_builder",
    PromptBuilder(template=template)
)
# OpenAI generator that produces the answer from the rendered prompt.
rag_pipeline.add_component(
    "generator",
    OpenAIGenerator(model="gpt-4-turbo")
)
# Collects the generator replies, their metadata and the retrieved documents into answer objects.
rag_pipeline.add_component(
    "answer_builder",
    AnswerBuilder()
)

In [None]:
# Wire the RAG pipeline.
# Retrieval: the query embedder feeds the retriever's query_embedding input.
rag_pipeline.connect("query_embedder", "retriever.query_embedding")
# Prompting: retrieved documents go into the prompt, and the prompt goes to the generator.
rag_pipeline.connect("retriever", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "generator")
# Answer assembly: replies, metadata and the retrieved documents all go to the answer builder.
rag_pipeline.connect("generator.replies", "answer_builder.replies")
rag_pipeline.connect("generator.meta", "answer_builder.meta")
rag_pipeline.connect("retriever", "answer_builder.documents")

In [None]:
# Question to ask the RAG pipeline.
question = "Do high levels of procalcitonin in the early phase after pediatric liver transplantation indicate poor postoperative outcome?"
# Alternative (disabled): the same question written in Chinese.
# question = "小兒肝移植術後早期降鈣素原高是否表示術後效果不佳？"

# Run the pipeline: the same question string is supplied to the embedder ("text"),
# the prompt builder ("question") and the answer builder ("query").
response = rag_pipeline.run(
    {
        component: {input_name: question}
        for component, input_name in (
            ("query_embedder", "text"),
            ("prompt_builder", "question"),
            ("answer_builder", "query"),
        )
    }
)
# Print the text of the first generated answer.
print(response["answer_builder"]["answers"][0].data)