In [1]:
# Dataset loading
from datasets import load_dataset
# Haystack core types, document store and components
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.writers import DocumentWriter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the train split of the "Seven Wonders of the Ancient World" dataset
dataset = load_dataset("bilgeyucel/seven-wonders", split="train")

In [4]:
# Preview the first two rows of the dataset
print(dataset[:2])

{'id': ['b3de1a673c1eb2876585405395a10c3d', '5dcd01886fcb24322578ceb49c96cc3e'], 'content': ['The Colossus of Rhodes (Ancient Greek: ὁ Κολοσσὸς Ῥόδιος, romanized:\xa0ho Kolossòs Rhódios Greek: Κολοσσός της Ρόδου, romanized:\xa0Kolossós tes Rhódou)[a] was a statue of the Greek sun-god Helios, erected in the city of Rhodes, on the Greek island of the same name, by Chares of Lindos in 280\xa0BC. One of the Seven Wonders of the Ancient World, it was constructed to celebrate the successful defence of Rhodes city against an attack by Demetrius Poliorcetes, who had besieged it for a year with a large army and navy.\nAccording to most contemporary descriptions, the Colossus stood approximately 70 cubits, or 33 metres (108 feet) high – approximately the height of the modern Statue of Liberty from feet to crown – making it the tallest statue in the ancient world.[2] It collapsed during the earthquake of 226 BC, although parts of it were preserved. In accordance with a certain oracle, the Rhodian

In [5]:
# Show the cache files reported by the dataset
print(dataset.cache_files)


[{'filename': '/Users/samhsiao/.cache/huggingface/datasets/bilgeyucel___seven-wonders/default/0.0.0/fb6a760df211962001d69fda7f3b42568ca938f8/seven-wonders-train.arrow'}]


In [6]:
# Name of the embedding model shared by the indexing and query pipelines
model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

# Create the in-memory document store
document_store = InMemoryDocumentStore()

# Wrap every dataset row in a Haystack Document, carrying over its metadata
documents = [Document(content=row["content"], meta=row["meta"]) for row in dataset]

In [8]:
# Build the indexing pipeline: embed the documents, then write them to the store
indexing_pipeline = Pipeline()

# Register the document embedder and the writer that targets the document store
indexing_pipeline.add_component(
    name="embedder",
    instance=SentenceTransformersDocumentEmbedder(model=model),
)
indexing_pipeline.add_component(
    name="writer",
    instance=DocumentWriter(document_store=document_store),
)

# Feed the embedder's "documents" output into the writer's "documents" input
indexing_pipeline.connect("embedder.documents", "writer.documents")

# Run the indexing pipeline; its return value is shown as the cell output
indexing_pipeline.run(data={"documents": documents})

Batches: 100%|██████████| 5/5 [00:06<00:00,  1.38s/it]


{'writer': {'documents_written': 151}}

In [9]:
# All Haystack components used in this cell are imported in the notebook's
# first (import) cell, so nothing is re-imported here.

# Retriever that searches the in-memory document store by embedding
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

# Extractive reader; warmed up explicitly before it is added to the pipeline
reader = ExtractiveReader()
reader.warm_up()

# Build the extractive question-answering pipeline
extractive_qa_pipeline = Pipeline()

# Query embedder: uses the same model name as the indexing pipeline
extractive_qa_pipeline.add_component(
    instance=SentenceTransformersTextEmbedder(model=model),
    name="embedder"
)
# Retriever component
extractive_qa_pipeline.add_component(
    instance=retriever,
    name="retriever"
)
# Reader component
extractive_qa_pipeline.add_component(
    instance=reader,
    name="reader"
)

# Feed the query embedding into the retriever
extractive_qa_pipeline.connect(
    "embedder.embedding",
    "retriever.query_embedding"
)
# Feed the retrieved documents into the reader; as the last expression of the
# cell, the value returned by connect() is displayed as the cell output
extractive_qa_pipeline.connect(
    "retriever.documents",
    "reader.documents"
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x16a6bbbe0>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - reader: ExtractiveReader
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

In [10]:
# Question to ask the pipeline
query = "Who was Pliny the Elder?"

# Run the extractive QA pipeline: the query text goes to the embedder, the
# retriever is asked for 3 documents and the reader for 2 answers
result = extractive_qa_pipeline.run(
    data=dict(
        embedder=dict(text=query),
        retriever=dict(top_k=3),
        reader=dict(query=query, top_k=2),
    )
)
print(result)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.84it/s]


{'reader': {'answers': [ExtractedAnswer(query='Who was Pliny the Elder?', score=0.8306005597114563, data='Roman writer', document=Document(id=bb2c5f3d2e2e2bf28d599c7b686ab47ba10fbc13c07279e612d8632af81e5d71, content: 'The Roman writer Pliny the Elder, writing in the first century AD, argued that the Great Pyramid had...', meta: {'url': 'https://en.wikipedia.org/wiki/Great_Pyramid_of_Giza', '_split_id': 16}, score: 21.66772941840059), context=None, document_offset=ExtractedAnswer.Span(start=4, end=16), context_offset=None, meta={}), ExtractedAnswer(query='Who was Pliny the Elder?', score=0.7280887365341187, data='a Roman author', document=Document(id=8910f21f7c0e97792473bcc60a8dcc7f6a90586dbb46b7bf96d28dbfcdc313f4, content: '[21]
Pliny the Elder (AD 23/24 – 79) was a Roman author, a naturalist and natural philosopher, a nav...', meta: {'url': 'https://en.wikipedia.org/wiki/Colossus_of_Rhodes', '_split_id': 8}, score: 26.857539924645973), context=None, document_offset=ExtractedAnswer.Spa

In [11]:
# Format the extracted answers for readable output
def format_extracted_answers(answers):
    """Render extracted answers as a human-readable, multi-line string.

    Args:
        answers: Iterable of answer objects exposing ``query``, ``data``,
            ``score``, ``document`` (an object with ``content`` and ``meta``,
            or a falsy value) and ``document_offset`` (an object with
            ``start`` and ``end``, or a falsy value).

    Returns:
        One formatted section per answer, joined with newlines. An empty
        string is returned when ``answers`` is empty.
    """
    formatted_answers = []
    for answer in answers:
        data = answer.data if answer.data else "No answer provided."
        score = answer.score

        document = answer.document
        # Tolerate a missing document as well as a document whose meta is None.
        meta = document.meta if document else None
        document_url = meta['url'] if meta and 'url' in meta else "No URL available."

        # Reset per answer so that a failed conversion can neither leave the
        # names unbound nor leak the previous answer's offsets into this one.
        start = end = None
        excerpt = "No excerpt available."

        offset = answer.document_offset
        if offset:
            try:
                # The tuple is fully evaluated before assignment, so a failure
                # on either bound leaves both start and end as None.
                start, end = int(offset.start), int(offset.end)
            except (ValueError, TypeError):
                excerpt = "Invalid indices for excerpt."
            else:
                # Only slice real document text, never a placeholder message.
                if document and isinstance(document.content, str):
                    excerpt = document.content[start:end]

        formatted_answer = f"""
        Query: {answer.query}
        Answer: {data}
        Score: {score:.4f}
        Document Excerpt: {excerpt}
        Document URL: {document_url}
        Context (Start-End): {start}-{end}
        """
        formatted_answers.append(formatted_answer)
    return "\n".join(formatted_answers)

# Inspect the results: pull the reader's answers out of the pipeline output
# and print them in the formatted layout
answers = result["reader"]["answers"]
print("提取的答案：\n", format_extracted_answers(answers))

提取的答案：
 
        Query: Who was Pliny the Elder?
        Answer: Roman writer
        Score: 0.8306
        Document Excerpt: Roman writer
        Document URL: https://en.wikipedia.org/wiki/Great_Pyramid_of_Giza
        Context (Start-End): 4-16
        

        Query: Who was Pliny the Elder?
        Answer: a Roman author
        Score: 0.7281
        Document Excerpt: a Roman author
        Document URL: https://en.wikipedia.org/wiki/Colossus_of_Rhodes
        Context (Start-End): 41-55
        

        Query: Who was Pliny the Elder?
        Answer: No answer provided.
        Score: 0.0461
        Document Excerpt: No excerpt available.
        Document URL: No URL available.
        Context (Start-End): None-None
        
