In [1]:
# 載入數據集
from datasets import load_dataset
from haystack import Document

# 加載 PubMedQA 數據集，取前 1000 條數據
dataset = load_dataset(
    "vblagoje/PubMedQA_instruction",
    split="train"
)
dataset = dataset.select(range(1000))

# 提取文件
all_documents = [
    Document(content=doc["context"])
    for doc in dataset
]
# 提取問題
all_questions = [
    doc["instruction"]
    for doc in dataset
]
# 提取真實答案
all_ground_truth_answers = [
    doc["response"]
    for doc in dataset
]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from typing import List
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

In [3]:
# 建立索引管道
indexing = Pipeline()

In [4]:
# 建立 `文件嵌入器`
document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)

# 建立 `內存文件儲存` 對象
document_store = InMemoryDocumentStore()

# 使用儲存對象建立 `文件寫入器`
document_writer = DocumentWriter(
    document_store=document_store,
    # 重複時跳過
    policy=DuplicatePolicy.SKIP
)

In [5]:
# 添加管道元件
indexing.add_component(
    instance=document_embedder,
    name="document_embedder"
)
indexing.add_component(
    instance=document_writer,
    name="document_writer"
)

In [6]:
# 連接管道元件：連接嵌入器和寫入器
indexing.connect(
    "document_embedder.documents",
    "document_writer.documents"
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x326956260>
🚅 Components
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - document_embedder.documents -> document_writer.documents (List[Document])

In [7]:
# 執行索引管道
indexing.run(
    {"document_embedder": {"documents": all_documents}}
)

Batches: 100%|██████████| 32/32 [00:06<00:00,  4.64it/s]


{'document_writer': {'documents_written': 1000}}

In [8]:
indexing.draw('ex15-1.png')

In [9]:
import os
from getpass import getpass
from dotenv import load_dotenv

# 載入環境變數
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
# 設置 OpenAI API 金鑰
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

In [10]:
from haystack.components.builders import AnswerBuilder, PromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

In [11]:
# 定義生成答案的模板
template = """
        您必須僅根據給定的上下文資訊回答以下問題。

        上下文:
        {% for document in documents %}
            {{ document.content }}
        {% endfor %}

        問題: {{question}}
        答案:
        """

In [12]:
# 建立 RAG 管道
rag_pipeline = Pipeline()

In [13]:
rag_pipeline.add_component(
    "query_embedder", 
    SentenceTransformersTextEmbedder(
        model="sentence-transformers/all-MiniLM-L6-v2"
    )
)
rag_pipeline.add_component(
    "retriever",
    InMemoryEmbeddingRetriever(document_store, top_k=3)
)
rag_pipeline.add_component(
    "prompt_builder",
    PromptBuilder(template=template)
)
rag_pipeline.add_component(
    "generator",
    OpenAIGenerator(model="gpt-4-turbo")
)
rag_pipeline.add_component(
    "answer_builder",
    AnswerBuilder()
)

In [14]:
# 連接管道的組件
rag_pipeline.connect(
    "query_embedder", "retriever.query_embedding"
)
rag_pipeline.connect(
    "retriever", "prompt_builder.documents"
)
rag_pipeline.connect(
    "prompt_builder", "generator"
)
rag_pipeline.connect(
    "generator.replies", "answer_builder.replies"
)
rag_pipeline.connect(
    "generator.meta", "answer_builder.meta"
)
rag_pipeline.connect(
    "retriever", "answer_builder.documents"
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x38db2ded0>
🚅 Components
  - query_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - generator: OpenAIGenerator
  - answer_builder: AnswerBuilder
🛤️ Connections
  - query_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - retriever.documents -> answer_builder.documents (List[Document])
  - prompt_builder.prompt -> generator.prompt (str)
  - generator.replies -> answer_builder.replies (List[str])
  - generator.meta -> answer_builder.meta (List[Dict[str, Any]])

In [17]:
# 問題：小兒肝移植術後早期降鈣素原高是否表示術後效果不佳？
question = "Do high levels of procalcitonin in the early phase after?"

# 運行管道
response = rag_pipeline.run(
    {
        "query_embedder": {"text": question},
        "prompt_builder": {"question": question},
        "answer_builder": {"query": question}
    }
)
# 輸出
print(response["answer_builder"]["answers"][0].data)

Batches: 100%|██████████| 1/1 [00:00<00:00,  8.92it/s]


pediatric liver transplantation correlate with higher morbidity and complications? Yes, high levels of procalcitonin (PCT) on postoperative day 2 are associated with higher International Normalized Ratio values on postoperative day 5, more frequent cases of primary graft non-function, longer stays in the pediatric intensive care unit, and longer durations on mechanical ventilation. This suggests that elevated PCT levels can indeed be an indicator of higher morbidity and complications in the early phase after pediatric liver transplantation.


In [21]:
# 這是原本的代碼，便於識別所以使用註解的方式保留
# rag_pipeline.add_component(
#     "query_embedder", 
#     SentenceTransformersTextEmbedder(
#         model="sentence-transformers/all-MiniLM-L6-v2"
#     )
# )

# 改用支持多語言的嵌入模型
rag_pipeline.add_component(
    # 這是新的嵌入模型名稱
    "multi_language_embedder",
    SentenceTransformersTextEmbedder(
        # 使用新的嵌入模型
        model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
)

In [24]:
# 連接新組件到現有的管道中
# 移除舊的 query_embedder 連接
rag_pipeline.disconnect("query_embedder", "retriever.query_embedding")
# 新的組件與舊的嵌入器提供相同的功能，且希望保持原有的 retriever 和後續處理不變
rag_pipeline.connect(
    "multi_language_embedder", "retriever.query_embedding"
)

AttributeError: 'Pipeline' object has no attribute 'disconnect'