In [1]:
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset
from haystack import Document

dataset = load_dataset("anakin87/medrag-pubmed-chunk", split="train")

docs = []
for doc in dataset:
    docs.append(
        Document(
            content=doc["contents"],
            meta={
                "title": doc["title"],
                "abstract": doc["content"],
                "pmid": doc["id"]
            }
        )
    )

In [3]:
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors.document_splitter import DocumentSplitter
from haystack import Pipeline
from haystack.utils import ComponentDevice

# 將文件分割為 512 個詞的塊
document_splitter = DocumentSplitter(
    split_by="word",
    split_length=512,
    split_overlap=32
)

# 建立用於密集檢索的文件嵌入
document_embedder = SentenceTransformersDocumentEmbedder(
    model="BAAI/bge-small-en-v1.5",
    # MacOS 不使用 CUDA
    # device=ComponentDevice.from_str("cuda:0")
    device=ComponentDevice.from_str("cpu")
)

# 將文件寫入文件儲存
document_writer = DocumentWriter(document_store)

indexing_pipeline = Pipeline()
indexing_pipeline.add_component("document_splitter", document_splitter)
indexing_pipeline.add_component("document_embedder", document_embedder)
indexing_pipeline.add_component("document_writer", document_writer)

indexing_pipeline.connect("document_splitter", "document_embedder")
indexing_pipeline.connect("document_embedder", "document_writer")

indexing_pipeline.run({"document_splitter": {"documents": docs}})

Batches: 100%|██████████| 481/481 [09:14<00:00,  1.15s/it]


{'document_writer': {'documents_written': 15380}}

In [4]:
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder

text_embedder = SentenceTransformersTextEmbedder(
    model="BAAI/bge-small-en-v1.5",
    # device=ComponentDevice.from_str("cuda:0")
    device=ComponentDevice.from_str("cpu")
)
embedding_retriever = InMemoryEmbeddingRetriever(document_store)
bm25_retriever = InMemoryBM25Retriever(document_store)

In [5]:
from haystack.components.joiners import DocumentJoiner

document_joiner = DocumentJoiner()

In [6]:
from haystack.components.rankers import TransformersSimilarityRanker


ranker = TransformersSimilarityRanker(
    model="BAAI/bge-reranker-base"
)

In [7]:
from haystack import Pipeline

hybrid_retrieval = Pipeline()
hybrid_retrieval.add_component("text_embedder", text_embedder)
hybrid_retrieval.add_component("embedding_retriever", embedding_retriever)
hybrid_retrieval.add_component("bm25_retriever", bm25_retriever)
hybrid_retrieval.add_component("document_joiner", document_joiner)
hybrid_retrieval.add_component("ranker", ranker)

hybrid_retrieval.connect("text_embedder", "embedding_retriever")
hybrid_retrieval.connect("bm25_retriever", "document_joiner")
hybrid_retrieval.connect("embedding_retriever", "document_joiner")
hybrid_retrieval.connect("document_joiner", "ranker")

<haystack.core.pipeline.pipeline.Pipeline object at 0x323ce2f20>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - embedding_retriever: InMemoryEmbeddingRetriever
  - bm25_retriever: InMemoryBM25Retriever
  - document_joiner: DocumentJoiner
  - ranker: TransformersSimilarityRanker
🛤️ Connections
  - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])
  - embedding_retriever.documents -> document_joiner.documents (List[Document])
  - bm25_retriever.documents -> document_joiner.documents (List[Document])
  - document_joiner.documents -> ranker.documents (List[Document])

In [8]:
hybrid_retrieval.draw("hybrid-retrieval.png")

In [10]:
# query = "嬰兒呼吸暫停該怎辦？"
query = "apnea in infants"

result = hybrid_retrieval.run(
    {
        "text_embedder": {"text": query},
        "bm25_retriever": {"query": query},
        "ranker": {"query": query},
    }
)

Batches: 100%|██████████| 1/1 [00:00<00:00, 23.05it/s]


In [11]:
result

{'ranker': {'documents': [Document(id=b1ef81d59ff9e73e9c731aa3dcf3d78df8ade9314b8694470730f0bb0d8ab554, content: 'Physiologic changes induced by theophylline in the treatment of apnea in preterm infants. Ten preter...', meta: {'title': 'Physiologic changes induced by theophylline in the treatment of apnea in preterm infants.', 'abstract': 'Ten preterm infants (birth weight 0.970 to 2.495 kg) with apnea due to periodic breathing (apneic interval = 5 to 10 seconds) or with "serious apnea" (greater than or equal to 20 seconds) were studied before and after the administration of theophylline. We determined the incidence of apnea, respiratory minute volume, alveolar gases, arterial gases and pH, "specific" compliance, functional residual capacity, and work of breathing. Theophylline decreased the incidence of apnea (P less than .05), increased respiratory minute volume (P less than 0.001), decreased (PACO2 (and PaCO2 P less than 0.001), increased the slope of the CO2 response curve (P less 

In [12]:
def pretty_print_results(prediction):
    for doc in prediction["documents"]:
        print(doc.meta["title"], "\t", doc.score)
        print(doc.meta["abstract"])
        print("\n", "\n")
        
pretty_print_results(result["ranker"])

Physiologic changes induced by theophylline in the treatment of apnea in preterm infants. 	 0.9714499115943909
Ten preterm infants (birth weight 0.970 to 2.495 kg) with apnea due to periodic breathing (apneic interval = 5 to 10 seconds) or with "serious apnea" (greater than or equal to 20 seconds) were studied before and after the administration of theophylline. We determined the incidence of apnea, respiratory minute volume, alveolar gases, arterial gases and pH, "specific" compliance, functional residual capacity, and work of breathing. Theophylline decreased the incidence of apnea (P less than .05), increased respiratory minute volume (P less than 0.001), decreased (PACO2 (and PaCO2 P less than 0.001), increased the slope of the CO2 response curve (P less than 0.02) with a significant shift to the left (P less than 0.02). These findings suggest that the decreased incidence of apnea after theophylline is associated with an increase in alveolar ventilation and increased sensitivity to

In [13]:
def pretty_print_results(prediction):
    # 檢查是否有文件被檢索到
    if not prediction["ranker"]["documents"]:
        print("沒有檢索到相關文件。")
        return

    # 美化輸出
    for idx, doc in enumerate(prediction["ranker"]["documents"]):
        print(f"\n文件 {idx + 1}:")
        print(f"標題: {doc.meta.get('title', '無標題')}")
        print(f"PMID: {doc.meta.get('pmid', '無PMID')}")
        print(f"分數: {doc.score:.4f}")
        print(f"摘要: {doc.meta.get('abstract', '無摘要')}")
        print("-" * 80)

# 美化並輸出結果
pretty_print_results(result)


文件 1:
標題: Physiologic changes induced by theophylline in the treatment of apnea in preterm infants.
PMID: PMID:22592
分數: 0.9714
摘要: Ten preterm infants (birth weight 0.970 to 2.495 kg) with apnea due to periodic breathing (apneic interval = 5 to 10 seconds) or with "serious apnea" (greater than or equal to 20 seconds) were studied before and after the administration of theophylline. We determined the incidence of apnea, respiratory minute volume, alveolar gases, arterial gases and pH, "specific" compliance, functional residual capacity, and work of breathing. Theophylline decreased the incidence of apnea (P less than .05), increased respiratory minute volume (P less than 0.001), decreased (PACO2 (and PaCO2 P less than 0.001), increased the slope of the CO2 response curve (P less than 0.02) with a significant shift to the left (P less than 0.02). These findings suggest that the decreased incidence of apnea after theophylline is associated with an increase in alveolar ventilation and in

中文檢索

In [24]:
import jieba

# 定義分詞函數
def tokenize(text):
    return " ".join(jieba.cut(text))

# 對查詢進行分詞
query = "嬰兒呼吸暫停該怎辦？"
tokenized_query = tokenize(query)
# 查看分詞後的問題
print(tokenized_query)

result = hybrid_retrieval.run(
    {
        "text_embedder": {"text": tokenized_query},
        "bm25_retriever": {"query": tokenized_query},
        "ranker": {"query": tokenized_query}
    }
)

# 輸出美化版結果
pretty_print_results(result)

嬰兒 呼吸 暫停 該 怎辦 ？


Batches: 100%|██████████| 1/1 [00:00<00:00, 58.85it/s]



文件 1:
標題: The introduction of SI units and the standardisation of laboratory reports: recommendations of the South African Association of Clinical Biochemists.
PMID: PMID:10633
分數: 0.0000
摘要: The change of SI units in laboratory reporting is occurring in South Africa. To assist this process and to ensure uniformity among laboratories, the Recommendations of the South African Association of Clinical Biochemists are presented. Controversial points are discussed and subunits selected. A standardised format of laboratory reports is suggested.
--------------------------------------------------------------------------------

文件 2:
標題: Interference in word associations in schizophrenia.
PMID: PMID:28335
分數: 0.0000
摘要: Assessed the effect of response interference on the word associations of male and female process and reactive schizophrenics in two studies that used the difference in associative disturbances between high and low interference (low and high commonality stimulus words) as the me

使用翻譯庫 `translate` 進行自動翻譯

查看一下當前的內容

In [30]:
import pprint

# 打印檢索結果的詳細內容
pprint.pprint(result)


{'ranker': {'documents': [Document(id=32540393426fd42391de81352bc3048c8908856e46ff05a9824bdf2389ab83aa, content: 'The introduction of SI units and the standardisation of laboratory reports: recommendations of the S...', meta: {'title': 'The introduction of SI units and the standardisation of laboratory reports: recommendations of the South African Association of Clinical Biochemists.', 'abstract': 'The change of SI units in laboratory reporting is occurring in South Africa. To assist this process and to ensure uniformity among laboratories, the Recommendations of the South African Association of Clinical Biochemists are presented. Controversial points are discussed and subunits selected. A standardised format of laboratory reports is suggested.', 'pmid': 'PMID:10633', 'source_id': '76fd863c9d39c6140d8032f3d2227f8097775f8b550f4bbed3477dfcd71f2e25', 'page_number': 1}, score: 3.743577690329403e-05),
                          Document(id=1804a66d982a78796723350acad4573880bc319eadbe0a3b6d41

In [31]:
from translate import Translator
import jieba
import pprint

# 初始化翻譯器
translator = Translator(to_lang="zh")

# 定義分詞函數
def tokenize(text):
    return " ".join(jieba.cut(text))

# 對查詢進行分詞
query = "嬰兒呼吸暫停該怎辦？"
tokenized_query = tokenize(query)

# 查看分詞後的查詢
print(f"分詞後的查詢: {tokenized_query}")

# 進行檢索
result = hybrid_retrieval.run(
    {
        "text_embedder": {"text": tokenized_query},
        "bm25_retriever": {"query": tokenized_query},
        "ranker": {"query": tokenized_query}
    }
)

# 檢查結果是否有內容
print("檢索結果:")
pprint.pprint(result)

# 定義翻譯函數
def translate_to_chinese(text):
    if text:  # 檢查文本是否為空
        return translator.translate(text)
    else:
        return "無內容"

# 美化輸出結果並進行翻譯
def pretty_print_and_translate_results(prediction):
    if "ranker" not in prediction or not prediction["ranker"]:
        print("沒有找到相關文件。")
        return

    for idx, doc in enumerate(prediction["ranker"]["documents"], start=1):
        if not doc.content:
            print(f"文件 {idx}: 沒有內容")
            continue

        translated_title = translate_to_chinese(doc.meta['title'])
        translated_abstract = translate_to_chinese(doc.meta['abstract'])
        translated_content = translate_to_chinese(doc.content)

        print(f"文件 {idx}:")
        print(f"標題: {translated_title}")
        print(f"摘要: {translated_abstract}")
        print(f"內容: {translated_content}")
        print("\n" + "-" * 80 + "\n")

# 輸出並翻譯結果
pretty_print_and_translate_results(result)


分詞後的查詢: 嬰兒 呼吸 暫停 該 怎辦 ？


Batches: 100%|██████████| 1/1 [00:00<00:00, 22.81it/s]


檢索結果:
{'ranker': {'documents': [Document(id=32540393426fd42391de81352bc3048c8908856e46ff05a9824bdf2389ab83aa, content: 'The introduction of SI units and the standardisation of laboratory reports: recommendations of the S...', meta: {'title': 'The introduction of SI units and the standardisation of laboratory reports: recommendations of the South African Association of Clinical Biochemists.', 'abstract': 'The change of SI units in laboratory reporting is occurring in South Africa. To assist this process and to ensure uniformity among laboratories, the Recommendations of the South African Association of Clinical Biochemists are presented. Controversial points are discussed and subunits selected. A standardised format of laboratory reports is suggested.', 'pmid': 'PMID:10633', 'source_id': '76fd863c9d39c6140d8032f3d2227f8097775f8b550f4bbed3477dfcd71f2e25', 'page_number': 1}, score: 3.743577690329403e-05),
                          Document(id=1804a66d982a78796723350acad4573880bc319eadbe0a

縮短

In [35]:
from translate import Translator
import jieba

# 初始化翻譯器
translator = Translator(to_lang="zh-tw")

# 定義分詞函數
def tokenize(text):
    return " ".join(jieba.cut(text))

# 定義文本截斷函數
def truncate_text(text, max_length=400):
    if len(text) > max_length:
        return text[:max_length] + "..."
    return text

# 定義翻譯函數
def translate_to_chinese(text):
    return translator.translate(text)

# 對查詢進行分詞
query = "嬰兒呼吸暫停急救措施"
tokenized_query = tokenize(query)

# 查看分詞後的查詢
print(f"分詞後的查詢: {tokenized_query}")

# 進行檢索
result = hybrid_retrieval.run(
    {
        "text_embedder": {"text": tokenized_query},
        "bm25_retriever": {"query": tokenized_query},
        "ranker": {"query": tokenized_query}
    }
)

# 檢查結果是否有內容
print("檢索結果:")
pprint.pprint(result)

# 美化輸出結果並進行翻譯
def pretty_print_and_translate_results(prediction):
    if "ranker" not in prediction or not prediction["ranker"]["documents"]:
        print("沒有找到相關文件。")
        return

    for idx, doc in enumerate(prediction["ranker"]["documents"], start=1):
        truncated_title = truncate_text(doc.meta['title'])
        truncated_abstract = truncate_text(doc.meta['abstract'])
        truncated_content = truncate_text(doc.content)

        translated_title = translate_to_chinese(truncated_title)
        translated_abstract = translate_to_chinese(truncated_abstract)
        translated_content = translate_to_chinese(truncated_content)

        print(f"文件 {idx}:")
        print(f"標題: {translated_title}")
        print(f"摘要: {translated_abstract}")
        print(f"內容: {translated_content}")
        print("\n" + "-" * 80 + "\n")

# 輸出並翻譯結果
pretty_print_and_translate_results(result)


分詞後的查詢: 嬰兒 呼吸 暫停 急救 措施


Batches: 100%|██████████| 1/1 [00:00<00:00, 54.94it/s]


檢索結果:
{'ranker': {'documents': [Document(id=b685698d22b8f26fc85192032362bacd102fc436b4986ee035edc4dfd1466557, content: 'The evaluation of the novel pressor activity of gamma-piperidinobutyramide (WY 20051, DF480). 1 gamm...', meta: {'title': 'The evaluation of the novel pressor activity of gamma-piperidinobutyramide (WY 20051, DF480).', 'abstract': '1 gamma-Piperidinobutyramide (Wy 20051, DF480) injected intravenously evoked pressor responses in the anaesthetized ganglion blocked rat preparation over the dose range 2.4 x 10(-6)-3.0 x 10(-4) mol/kg. 2 High doses (greater than 3.8 x 10(-5) mol/kg) or even repeated submaximal doses (1.9 x 10(-5) mol/kg) of Wy 20051 caused tachyphylaxis of this pressor response. 3 The noradrenaline pressor-response curve was shifted significantly to the right of the control curve following a dose of Wy 20051 (1.5 x 10(-4) mol/kg cumulative). 4 The dose-response curve for the pressor action of Wy 20051 was potentiated in reserpine-treated anaesthetized rats

KeyboardInterrupt: 