In [1]:
from datetime import datetime

from haystack import Document
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.errors import DuplicateDocumentError
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Source records for the sample corpus: (content, version, date).
install_notes = [
    (
        "Use pip to install a basic version of Haystack's latest release: pip install farm-haystack. All the core Haystack components live in the haystack repo. But there's also the haystack-extras repo which contains components that are not as widely used, and you need to install them separately.",
        1.15,
        datetime(2023, 3, 30),
    ),
    (
        "Use pip to install a basic version of Haystack's latest release: pip install farm-haystack[inference]. All the core Haystack components live in the haystack repo. But there's also the haystack-extras repo which contains components that are not as widely used, and you need to install them separately.",
        1.22,
        datetime(2023, 11, 7),
    ),
    (
        "Use pip to install only the Haystack 2.0 code: pip install haystack-ai. The haystack-ai package is built on the main branch which is an unstable beta version, but it's useful if you want to try the new features as soon as they are merged.",
        2.0,
        datetime(2023, 12, 4),
    ),
]

# Turn every record into a Document whose metadata carries "version" and "date".
documents = [
    Document(content=text, meta={"version": version, "date": date})
    for text, version, date in install_notes
]

# Create an in-memory document store configured with the BM25Plus algorithm.
document_store = InMemoryDocumentStore(bm25_algorithm="BM25Plus")

# Write the documents into the store; as the cell's last expression, the
# return value of write_documents is displayed as the cell output.
document_store.write_documents(documents=documents)

  from .autonotebook import tqdm as notebook_tqdm


3

再次寫入相同的文件會出錯：這些文件的 ID 已經存在於文件儲存中，因此會引發 DuplicateDocumentError。

In [2]:
# Demonstrate duplicate handling: writing the same documents a second time
# raises DuplicateDocumentError because their IDs already exist in the store.
# The error is the point of this cell, so it is caught and printed instead of
# being left to abort a "Restart Kernel -> Run All" execution.
try:
    docs = document_store.write_documents(
        documents=documents
    )
except DuplicateDocumentError as err:
    print(f"DuplicateDocumentError: {err}")
else:
    # Only reached when the write succeeds: show the returned value.
    print(docs)

DuplicateDocumentError: ID '3d3b2afa171bee3bbff4a94baaec239f9d28bba333114a08ad6d0b684710a3be' already exists.

查看有哪些屬性可用

In [3]:
# Collect the names of every attribute and method exposed by the document
# store, then print them as a single list.
attribute_names = dir(document_store)
print(attribute_names)

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_avg_doc_len', '_bm25_attr', '_compute_query_embedding_similarity_scores', '_dispatch_bm25', '_freq_vocab_for_idf', '_score_bm25l', '_score_bm25okapi', '_score_bm25plus', '_tokenize_bm25', 'bm25_algorithm', 'bm25_algorithm_inst', 'bm25_parameters', 'bm25_retrieval', 'bm25_tokenization_regex', 'count_documents', 'delete_documents', 'embedding_retrieval', 'embedding_similarity_function', 'filter_documents', 'from_dict', 'index', 'storage', 'to_dict', 'tokenizer', 'write_documents']


In [4]:
# Ask the store how many documents it currently holds and report the total.
document_count = document_store.count_documents()
count_message = f"儲存的文件數量：{document_count}"
print(count_message)


儲存的文件數量：3


In [None]:
from haystack import Pipeline

# Build the BM25 retriever on top of the in-memory document store first ...
retriever = InMemoryBM25Retriever(document_store=document_store)

# ... then create an empty pipeline and register the retriever under the
# component name "retriever" (the name used to address it in pipeline.run).
pipeline = Pipeline()
pipeline.add_component(name="retriever", instance=retriever)

In [None]:
# Ask a question while filtering on metadata: keep only documents whose
# meta.version is greater than 1.21.
query = "Haystack installation"
version_filter = {"field": "meta.version", "operator": ">", "value": 1.21}

retriever_inputs = {"query": query, "filters": version_filter}
result = pipeline.run(data={"retriever": retriever_inputs})

# Print the pipeline output.
print(result)

In [None]:
# Ask a question with a compound filter: both conditions must hold (AND) --
# meta.version greater than 1.21 and meta.date later than 2023-11-07.
query = "Haystack installation"

newer_than_version = {"field": "meta.version", "operator": ">", "value": 1.21}
later_than_date = {"field": "meta.date", "operator": ">", "value": datetime(2023, 11, 7)}
version_and_date_filter = {
    "operator": "AND",
    "conditions": [newer_than_version, later_than_date],
}

retriever_inputs = {"query": query, "filters": version_and_date_filter}
result = pipeline.run(data={"retriever": retriever_inputs})

# Print the pipeline output.
print(result)