In [2]:
# 載入數據集
from datasets import load_dataset
from haystack import Document

# Load the PubMedQA instruction dataset (train split) and keep only the
# first 1000 records.
dataset = load_dataset("vblagoje/PubMedQA_instruction", split="train").select(range(1000))

# One Haystack Document per record, built from the record's "context" field.
all_documents = [Document(content=context) for context in dataset["context"]]
# The question of each record ("instruction" field), in dataset order.
all_questions = list(dataset["instruction"])
# The reference (ground-truth) answer of each record ("response" field).
all_ground_truth_answers = list(dataset["response"])

Downloading readme: 100%|██████████| 498/498 [00:00<00:00, 1.15MB/s]
Downloading data: 100%|██████████| 274M/274M [00:27<00:00, 10.1MB/s] 
Downloading data: 100%|██████████| 986k/986k [00:01<00:00, 978kB/s]
Generating train split: 100%|██████████| 272458/272458 [00:00<00:00, 354140.92 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 426988.09 examples/s]


In [3]:
from typing import List
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

In [4]:
# Create the (initially empty) indexing pipeline.
indexing = Pipeline()

In [5]:
# In-memory document store that will receive the indexed documents.
document_store = InMemoryDocumentStore()

# Document writer bound to the store above; duplicates are skipped on write.
document_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)

# Document embedder using the all-MiniLM-L6-v2 sentence-transformers model.
document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

In [6]:
# Register both components with the indexing pipeline under the names used
# when connecting and running it.
indexing.add_component(name="document_embedder", instance=document_embedder)
indexing.add_component(name="document_writer", instance=document_writer)

In [7]:
# Wire the embedder's "documents" output into the writer's "documents" input.
# Left as the cell's final expression so the pipeline summary is displayed.
indexing.connect("document_embedder.documents", "document_writer.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x33a517550>
🚅 Components
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - document_embedder.documents -> document_writer.documents (List[Document])

In [8]:
# Run the indexing pipeline: all_documents is passed as the "documents" input
# of the "document_embedder" component. The returned dict is the cell output.
indexing.run(
    {"document_embedder": {"documents": all_documents}}
)

Batches: 100%|██████████| 32/32 [00:07<00:00,  4.46it/s]


{'document_writer': {'documents_written': 1000}}

In [9]:
# Draw the indexing pipeline and save the diagram as 'ex15-1.png'.
indexing.draw('ex15-1.png')

In [10]:
import os
from getpass import getpass
from dotenv import load_dotenv

# Load environment variables; load_dotenv() is called with no arguments, so it
# uses its default lookup (presumably a local .env file).
load_dotenv()

# Make sure an OpenAI API key is available. The previous unconditional
# `os.environ[...] = os.getenv(...)` raised TypeError when the variable was
# unset (os.environ only accepts str values) and made the interactive
# fallback unreachable. Prompt only when the key is missing or empty.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

In [11]:
from haystack.components.builders import AnswerBuilder, PromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

In [12]:
# Prompt template used to generate answers (Jinja-style placeholders).
# It loops over `documents` and emits each document's content as context,
# then inserts `question`. The template text itself is written in Chinese and
# instructs the model to answer only from the given context.
template = """
        您必須僅根據給定的上下文資訊回答以下問題。

        上下文:
        {% for document in documents %}
            {{ document.content }}
        {% endfor %}

        問題: {{question}}
        答案:
        """

In [13]:
# Create the (initially empty) RAG pipeline.
rag_pipeline = Pipeline()

In [14]:
# Register the RAG components under the names used when wiring the pipeline.

# Embeds the query text with the all-MiniLM-L6-v2 sentence-transformers model.
rag_pipeline.add_component(
    name="query_embedder",
    instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
# Embedding retriever over document_store, configured with top_k=3.
rag_pipeline.add_component(
    name="retriever",
    instance=InMemoryEmbeddingRetriever(document_store, top_k=3),
)
# Builds the prompt from the template defined earlier in the notebook.
rag_pipeline.add_component(
    name="prompt_builder",
    instance=PromptBuilder(template=template),
)
# OpenAI generator configured with the "gpt-4-turbo" model.
rag_pipeline.add_component(
    name="generator",
    instance=OpenAIGenerator(model="gpt-4-turbo"),
)
# Answer builder; it is wired to the generator and retriever outputs later.
rag_pipeline.add_component(
    name="answer_builder",
    instance=AnswerBuilder(),
)

In [15]:
# Wire the RAG pipeline components together.
# Every connection names both the output and the input socket explicitly,
# matching the style of the indexing pipeline, so the wiring does not depend
# on implicit socket resolution. The resulting graph is the same as before.

# Query embedding -> retriever.
rag_pipeline.connect("query_embedder.embedding", "retriever.query_embedding")
# Retrieved documents -> prompt context.
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
# Rendered prompt -> generator.
rag_pipeline.connect("prompt_builder.prompt", "generator.prompt")
# Generator replies and metadata -> answer builder.
rag_pipeline.connect("generator.replies", "answer_builder.replies")
rag_pipeline.connect("generator.meta", "answer_builder.meta")
# Retrieved documents -> answer builder. Kept as the cell's final bare
# expression so the notebook still displays the pipeline summary.
rag_pipeline.connect("retriever.documents", "answer_builder.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x38bad2230>
🚅 Components
  - query_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - generator: OpenAIGenerator
  - answer_builder: AnswerBuilder
🛤️ Connections
  - query_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - retriever.documents -> answer_builder.documents (List[Document])
  - prompt_builder.prompt -> generator.prompt (str)
  - generator.replies -> answer_builder.replies (List[str])
  - generator.meta -> answer_builder.meta (List[Dict[str, Any]])