In [1]:
from haystack.document_stores.in_memory import InMemoryDocumentStore

# In-memory document store; the writer and both retrievers defined later are built on this instance.
document_store = InMemoryDocumentStore()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset
from haystack import Document

# Load the "train" split of the PubMed chunk dataset.
dataset = load_dataset("anakin87/medrag-pubmed-chunk", split="train")

# Wrap every record in a Haystack Document: the "contents" field becomes the
# document text, while title, "content" (stored as the abstract) and id are
# kept as metadata.
docs = [
    Document(
        content=record["contents"],
        meta={
            "title": record["title"],
            "abstract": record["content"],
            "pmid": record["id"],
        },
    )
    for record in dataset
]

In [3]:
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors.document_splitter import DocumentSplitter
from haystack import Pipeline
from haystack.utils import ComponentDevice

# Split documents into 512-word chunks with a 32-word overlap.
document_splitter = DocumentSplitter(split_by="word", split_length=512, split_overlap=32)

# Embed the chunks for dense retrieval.
# Runs on CPU because macOS has no CUDA; use "cuda:0" on a CUDA-capable machine.
document_embedder = SentenceTransformersDocumentEmbedder(
    model="BAAI/bge-small-en-v1.5",
    device=ComponentDevice.from_str("cpu"),
)

# Write the embedded chunks into the document store.
document_writer = DocumentWriter(document_store)

# Register the components, then chain them: splitter -> embedder -> writer.
indexing_pipeline = Pipeline()
for component_name, component in (
    ("document_splitter", document_splitter),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
):
    indexing_pipeline.add_component(component_name, component)

for sender, receiver in (
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
):
    indexing_pipeline.connect(sender, receiver)

# Last expression of the cell, so the run summary is shown as the cell output.
indexing_pipeline.run({"document_splitter": {"documents": docs}})

Batches: 100%|██████████| 481/481 [15:37<00:00,  1.95s/it]


{'document_writer': {'documents_written': 15380}}

In [4]:
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder

# Query-side embedder; uses the same model name as the document embedder.
# Runs on CPU; use "cuda:0" on a CUDA-capable machine.
text_embedder = SentenceTransformersTextEmbedder(
    model="BAAI/bge-small-en-v1.5",
    device=ComponentDevice.from_str("cpu"),
)

# Dense (embedding) and sparse (BM25) retrievers over the same document store.
embedding_retriever = InMemoryEmbeddingRetriever(document_store=document_store)
bm25_retriever = InMemoryBM25Retriever(document_store=document_store)

In [5]:
from haystack.components.joiners import DocumentJoiner

# Joiner that is later connected to receive documents from both retrievers.
document_joiner = DocumentJoiner()

In [1]:
from haystack.components.rankers import TransformersSimilarityRanker


# Ranker that is later connected after the document joiner and given the query text.
ranker = TransformersSimilarityRanker(
    model="BAAI/bge-reranker-base"
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from haystack import Pipeline

# Hybrid retrieval pipeline:
#   text_embedder -> embedding_retriever --+
#                                          +--> document_joiner -> ranker
#   bm25_retriever ------------------------+
hybrid_retrieval = Pipeline()

for component_name, component in (
    ("text_embedder", text_embedder),
    ("embedding_retriever", embedding_retriever),
    ("bm25_retriever", bm25_retriever),
    ("document_joiner", document_joiner),
    ("ranker", ranker),
):
    hybrid_retrieval.add_component(component_name, component)

for sender, receiver in (
    ("text_embedder", "embedding_retriever"),
    ("bm25_retriever", "document_joiner"),
    ("embedding_retriever", "document_joiner"),
    ("document_joiner", "ranker"),
):
    hybrid_retrieval.connect(sender, receiver)

In [None]:
# Draw the pipeline graph to hybrid-retrieval.png.
hybrid_retrieval.draw("hybrid-retrieval.png")

In [None]:
# Chinese variant of the query, left disabled:
# query = "嬰兒呼吸暫停該怎辦？"
query = "apnea in infants"

# The same query string is passed to the text embedder, the BM25 retriever
# and the ranker.
result = hybrid_retrieval.run(
    {
        "text_embedder": {"text": query},
        "bm25_retriever": {"query": query},
        "ranker": {"query": query},
    }
)

In [None]:
# Display the raw pipeline output.
result

In [None]:
def pretty_print_results(prediction):
    """Print title, score and abstract for every document in ``prediction``.

    Args:
        prediction: Mapping with a "documents" entry; each document must
            expose ``meta`` (with "title" and "abstract" keys) and ``score``.
    """
    for hit in prediction["documents"]:
        meta = hit.meta
        # Title and score on one line, separated by a tab surrounded by spaces.
        print(f'{meta["title"]} \t {hit.score}')
        print(meta["abstract"])
        # Blank-line separator between documents.
        print("\n \n")
        
# Print the documents returned by the ranker component.
pretty_print_results(result["ranker"])

In [None]:
def pretty_print_results(prediction):
    """Pretty-print the ranked documents of a full pipeline result.

    Unlike a helper that takes ``result["ranker"]``, this function expects the
    whole pipeline output, i.e. ``{"ranker": {"documents": [...]}}``.

    Args:
        prediction: Pipeline output mapping. A missing or empty "ranker" /
            "documents" entry is reported as "no documents retrieved" instead
            of raising ``KeyError``.
    """
    # Tolerate a missing "ranker" key or a missing/None "documents" list.
    documents = (prediction.get("ranker") or {}).get("documents") or []
    if not documents:
        print("沒有檢索到相關文件。")
        return

    for idx, doc in enumerate(documents, start=1):
        # A document's score may be unset; avoid formatting None as a float.
        score_text = f"{doc.score:.4f}" if doc.score is not None else "無分數"
        print(f"\n文件 {idx}:")
        print(f"標題: {doc.meta.get('title', '無標題')}")
        print(f"PMID: {doc.meta.get('pmid', '無PMID')}")
        print(f"分數: {score_text}")
        print(f"摘要: {doc.meta.get('abstract', '無摘要')}")
        print("-" * 80)

# Pretty-print the full pipeline result.
pretty_print_results(result)

中文檢索

In [None]:
import jieba

# Word-segmentation helper.
def tokenize(text):
    """Segment ``text`` with ``jieba.cut`` and join the pieces with single spaces."""
    segments = jieba.cut(text)
    return " ".join(segments)

# Tokenize the Chinese query.
# NOTE(review): the embedding model name ("bge-small-en") suggests English-only
# embeddings, so a Chinese query may retrieve poorly — confirm.
query = "嬰兒呼吸暫停該怎辦？"
tokenized_query = tokenize(query)
# Show the tokenized query.
print(tokenized_query)

result = hybrid_retrieval.run(
    {
        "text_embedder": {"text": tokenized_query},
        "bm25_retriever": {"query": tokenized_query},
        "ranker": {"query": tokenized_query}
    }
)

# Pretty-print the results.
pretty_print_results(result)

使用翻譯庫 `translate` 進行自動翻譯

查看一下當前的內容

In [None]:
import pprint

# Print the full structure of the retrieval result.
pprint.pprint(result)


In [None]:
from translate import Translator
import jieba
import pprint

# Initialise the translator.
# NOTE(review): a later cell targets "zh-tw" instead of "zh" — confirm which variant is intended.
translator = Translator(to_lang="zh")

# Word-segmentation helper.
def tokenize(text):
    """Segment ``text`` with ``jieba.cut`` and join the pieces with single spaces."""
    segments = jieba.cut(text)
    return " ".join(segments)

# Tokenize the query.
query = "嬰兒呼吸暫停該怎辦？"
tokenized_query = tokenize(query)

# Show the tokenized query.
print(f"分詞後的查詢: {tokenized_query}")

# Run retrieval with the tokenized query fed to all three entry points.
result = hybrid_retrieval.run(
    {
        "text_embedder": {"text": tokenized_query},
        "bm25_retriever": {"query": tokenized_query},
        "ranker": {"query": tokenized_query}
    }
)

# Dump the raw result to check whether anything was retrieved.
print("檢索結果:")
pprint.pprint(result)

# Translation helper.
def translate_to_chinese(text):
    """Translate ``text`` with the module-level ``translator``.

    Returns the placeholder "無內容" when ``text`` is empty or otherwise falsy,
    without calling the translator.
    """
    if not text:
        return "無內容"
    return translator.translate(text)

# Pretty-print the results, translating each field.
def pretty_print_and_translate_results(prediction):
    """Translate and print title, abstract and content of each ranked document.

    Args:
        prediction: Full pipeline output, expected to look like
            ``{"ranker": {"documents": [...]}}``. A missing "ranker" entry or
            an empty/missing "documents" list is reported as "nothing found".
    """
    # Check the documents list itself: testing only the truthiness of
    # prediction["ranker"] lets {"documents": []} through and prints nothing.
    documents = (prediction.get("ranker") or {}).get("documents")
    if not documents:
        print("沒有找到相關文件。")
        return

    for idx, doc in enumerate(documents, start=1):
        # Skip documents without text; there is nothing to translate.
        if not doc.content:
            print(f"文件 {idx}: 沒有內容")
            continue

        translated_title = translate_to_chinese(doc.meta['title'])
        translated_abstract = translate_to_chinese(doc.meta['abstract'])
        translated_content = translate_to_chinese(doc.content)

        print(f"文件 {idx}:")
        print(f"標題: {translated_title}")
        print(f"摘要: {translated_abstract}")
        print(f"內容: {translated_content}")
        print("\n" + "-" * 80 + "\n")

# Translate and print the results.
pretty_print_and_translate_results(result)


縮短文字後再翻譯（先截斷標題、摘要與內容，再交給翻譯器）

In [None]:
from translate import Translator
import jieba

# Initialise the translator with "zh-tw" as the target language.
translator = Translator(to_lang="zh-tw")

# Word-segmentation helper.
def tokenize(text):
    """Segment ``text`` with ``jieba.cut`` and join the pieces with single spaces."""
    segments = jieba.cut(text)
    return " ".join(segments)

# Text-truncation helper.
def truncate_text(text, max_length=400):
    """Cut ``text`` to at most ``max_length`` characters, appending "..." when cut.

    Text whose length does not exceed ``max_length`` is returned unchanged.
    A truncated result is ``max_length + 3`` characters long because of the
    appended ellipsis.
    """
    return text if len(text) <= max_length else text[:max_length] + "..."

# Translation helper.
def translate_to_chinese(text):
    """Translate ``text`` with the module-level ``translator``.

    Empty or ``None`` text is not sent to the translator; the placeholder
    "無內容" is returned instead, matching the earlier definition of this
    helper in the notebook.
    """
    if not text:
        return "無內容"
    return translator.translate(text)

# Tokenize the query.
query = "嬰兒呼吸暫停急救措施"
tokenized_query = tokenize(query)

# Show the tokenized query.
print(f"分詞後的查詢: {tokenized_query}")

# Run retrieval with the tokenized query fed to all three entry points.
result = hybrid_retrieval.run(
    {
        "text_embedder": {"text": tokenized_query},
        "bm25_retriever": {"query": tokenized_query},
        "ranker": {"query": tokenized_query}
    }
)

# Dump the raw result to check whether anything was retrieved.
# pprint is not imported in this cell; it relies on the import from an earlier cell.
print("檢索結果:")
pprint.pprint(result)

# Pretty-print the results, truncating and then translating each field.
def pretty_print_and_translate_results(prediction):
    """Truncate, translate and print title, abstract and content of each ranked document.

    Args:
        prediction: Full pipeline output; the documents are read from
            ``prediction["ranker"]["documents"]``. A missing "ranker" key or
            an empty documents list prints a "nothing found" message.
    """
    ranked_docs = prediction["ranker"]["documents"] if "ranker" in prediction else []
    if not ranked_docs:
        print("沒有找到相關文件。")
        return

    for position, hit in enumerate(ranked_docs, start=1):
        # Read all three fields first, then truncate them, then translate them
        # (title, abstract, content — in that order).
        raw_fields = (hit.meta['title'], hit.meta['abstract'], hit.content)
        shortened = [truncate_text(field) for field in raw_fields]
        title_zh, abstract_zh, content_zh = [translate_to_chinese(field) for field in shortened]

        print(f"文件 {position}:")
        print(f"標題: {title_zh}")
        print(f"摘要: {abstract_zh}")
        print(f"內容: {content_zh}")
        print("\n" + "-" * 80 + "\n")

# Translate and print the results.
pretty_print_and_translate_results(result)
