In [33]:
# Import the required modules
# Used to build and manage Haystack pipelines (Document is the unit of data passed through them)
from haystack import Document, Pipeline
# In-memory document store, convenient for fast reads and writes
from haystack.document_stores.in_memory import InMemoryDocumentStore
# Document language classifier, used to detect each document's language
from haystack.components.classifiers import DocumentLanguageClassifier
# Routes documents to different processing nodes based on their metadata (e.g. language)
from haystack.components.routers import MetadataRouter
# Writes documents into the specified document store
from haystack.components.writers import DocumentWriter

In [34]:
# Hand-picked hotel review samples in several languages (French, Spanish, English).
# Each raw review text is wrapped in a Document; order matches the original list.
documents = [
    Document(content=review_text)
    for review_text in (
        "Super appartement. Juste au dessus de plusieurs bars qui ferment très tard. A savoir à l'avance. (Bouchons d'oreilles fournis !)",
        "El apartamento estaba genial y muy céntrico, todo a mano. Al lado de la librería Lello y De la Torre de los clérigos. Está situado en una zona de marcha, así que si vais en fin de semana , habrá ruido, aunque a nosotros no nos molestaba para dormir",
        "The keypad with a code is convenient and the location is convenient. Basically everything else, very noisy, wi-fi didn't work, check-in person didn't explain anything about facilities, shower head was broken, there's no cleaning and everything else one may need is charged.",
        "It is very central and appartement has a nice appearance (even though a lot IKEA stuff), *W A R N I N G the appartement presents itself as a elegant and as a place to relax, very wrong place to relax - you cannot sleep in this appartement, even the beds are vibrating from the bass of the clubs in the same building - you get ear plugs from the hotel -> now I understand why -> I missed a trip as it was so loud and I could not hear the alarm next day due to the ear plugs.- there is a green light indicating 'emergency exit' just above the bed, which shines very bright at night - during the arrival process, you felt the urge of the agent to leave as soon as possible. - try to go to 'RVA clerigos appartements' -> same price, super quiet, beautiful, city center and very nice staff (not an agency)- you are basically sleeping next to the fridge, which makes a lot of noise, when the compressor is running -> had to switch it off - but then had no cool food and drinks. - the bed was somehow broken down - the wooden part behind the bed was almost falling appart and some hooks were broken before- when the neighbour room is cooking you hear the fan very loud. I initially thought that I somehow activated the kitchen fan",
        "Un peu salé surtout le sol. Manque de service et de souplesse",
        "Nous avons passé un séjour formidable. Merci aux personnes , le bonjours à Ricardo notre taxi man, très sympathique. Je pense refaire un séjour parmi vous, après le confinement, tout était parfait, surtout leur gentillesse, aucune chaude négative. Je n'ai rien à redire de négative, Ils étaient a notre écoute, un gentil message tout les matins, pour nous demander si nous avions besoins de renseignement et savoir si tout allait bien pendant notre séjour.",
        "Céntrico. Muy cómodo para moverse y ver Oporto. Edificio con terraza propia en la última planta. Todo reformado y nuevo. Te traen un estupendo desayuno todas las mañanas al apartamento. Solo que se puede escuchar algo de ruido de la calle a primeras horas de la noche. Es un zona de ocio nocturno. Pero respetan los horarios.",
    )
]

In [35]:
# One independent in-memory document store per language: English, French, Spanish.
en_document_store, fr_document_store, es_document_store = (
    InMemoryDocumentStore() for _ in range(3)
)

# Language classifier restricted to the three languages that have a store.
language_classifier = DocumentLanguageClassifier(languages=["en", "fr", "es"])

In [36]:
# Build the metadata router: one output per language, selected by the
# `language` metadata value attached to each document.
#
# The rules use the explicit field/operator/value filter syntax. The legacy
# form ({"language": {"$eq": "en"}}) is rejected by newer Haystack 2.x
# releases with an "Invalid filter syntax" error, while this form is accepted
# across 2.x.
router_rules = {
    lang: {"field": "meta.language", "operator": "==", "value": lang}
    for lang in ("en", "fr", "es")
}
# Hand the routing rules to the MetadataRouter instance.
router = MetadataRouter(rules=router_rules)

In [37]:
# Print the router to inspect its input and output sockets
print(router)

<haystack.components.routers.metadata_router.MetadataRouter object at 0x32055ac50>
Inputs:
  - documents: List[Document]
Outputs:
  - unmatched: List[Document]
  - en: List[Document]
  - fr: List[Document]
  - es: List[Document]


In [38]:
# One writer per language, each bound to the matching document store.
en_writer, fr_writer, es_writer = (
    DocumentWriter(document_store=store)
    for store in (en_document_store, fr_document_store, es_document_store)
)

In [39]:
# Indexing pipeline: classify language -> route on metadata -> write to the
# store of the detected language.
indexing_pipeline = Pipeline()

# Register the components under the names used by the connections below.
indexing_pipeline.add_component("language_classifier", language_classifier)
indexing_pipeline.add_component("router", router)
indexing_pipeline.add_component("en_writer", en_writer)
indexing_pipeline.add_component("fr_writer", fr_writer)
indexing_pipeline.add_component("es_writer", es_writer)

# Wire the components together, spelling out the socket names explicitly.
# The last connect() call stays the final expression so the notebook still
# displays the pipeline summary.
indexing_pipeline.connect("language_classifier.documents", "router.documents")
indexing_pipeline.connect("router.en", "en_writer.documents")
indexing_pipeline.connect("router.fr", "fr_writer.documents")
indexing_pipeline.connect("router.es", "es_writer.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x32040fc10>
🚅 Components
  - language_classifier: DocumentLanguageClassifier
  - router: MetadataRouter
  - en_writer: DocumentWriter
  - fr_writer: DocumentWriter
  - es_writer: DocumentWriter
🛤️ Connections
  - language_classifier.documents -> router.documents (List[Document])
  - router.en -> en_writer.documents (List[Document])
  - router.fr -> fr_writer.documents (List[Document])
  - router.es -> es_writer.documents (List[Document])

In [40]:
# Render a diagram of the indexing pipeline to indexing_pipeline.png
indexing_pipeline.draw("indexing_pipeline.png")

In [41]:
# Run the pipeline; the displayed result reports how many documents each
# language writer stored (and any documents the router could not match).
indexing_pipeline.run({"language_classifier": {"documents": documents}})

{'router': {'unmatched': []},
 'en_writer': {'documents_written': 2},
 'fr_writer': {'documents_written': 3},
 'es_writer': {'documents_written': 2}}

In [42]:
# Show the contents of each per-language store (filter_documents() is called
# without any filter).
print("English documents: ", en_document_store.filter_documents())
print("French documents: ", fr_document_store.filter_documents())
print("Spanish documents: ", es_document_store.filter_documents())

English documents:  [Document(id=8f64ab234c6a5d5652d02bed144d069ec6e988903b071d16fffbf400abfc1047, content: 'The keypad with a code is convenient and the location is convenient. Basically everything else, very...', meta: {'language': 'en'}), Document(id=1e65b29e0cdbf3263ccb2fa6807288c40eb48b37f04fd37e131e2759c9a017cd, content: 'It is very central and appartement has a nice appearance (even though a lot IKEA stuff), *W A R N I ...', meta: {'language': 'en'})]
French documents:  [Document(id=ea7ea338874232de2d8105a258813f50345db82772e21ad2c4549dbb7adce8a3, content: 'Super appartement. Juste au dessus de plusieurs bars qui ferment très tard. A savoir à l'avance. (Bo...', meta: {'language': 'fr'}), Document(id=6b64c8a60543ee32b81cd39bc8d6e09fae4bff1b22c6ccdcf414db26fa354e7a, content: 'Un peu salé surtout le sol. Manque de service et de souplesse', meta: {'language': 'fr'}), Document(id=b1be23526f19a8af80a190e775bfd05e65878e585529037cb45b47267a4eaa98, content: 'Nous avons passé un séjour fo

In [43]:
from getpass import getpass
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# load_dotenv() already populates os.environ, so the key is not re-assigned
# here. The previous `os.environ[...] = os.getenv(...)` raised TypeError when
# the key was missing (os.getenv returns None), which meant the interactive
# fallback below could never run.
# Prompt for the key only when it is absent or empty.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

In [44]:
# Import the components needed for the RAG pipeline
# Performs BM25-based text retrieval against an in-memory store
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
# Merges several document lists into one unified collection for later steps
from haystack.components.joiners import DocumentJoiner
# Builds a prompt from a template, combining the query and the related documents into one text
from haystack.components.builders import PromptBuilder
# Calls the OpenAI API to generate text
from haystack.components.generators import OpenAIGenerator
# Routes a text (here, the query) to different paths based on its language
from haystack.components.routers import TextLanguageRouter

In [45]:
# Prompt template: instructions, then every retrieved review (one per loop
# iteration), then the user's question. Built from adjacent string literals;
# the resulting text is identical to the original triple-quoted template,
# including the leading and trailing newline.
prompt_template = (
    "\n"
    "您將收到有關住宿的評論。\n"
    "僅根據給定的評論簡潔地回答問題。\n"
    "評論：\n"
    "{% for doc in documents %}\n"
    "    {{ doc.content }}\n"
    "{% endfor %}\n"
    "問題：{{ query}}\n"
    "答案：\n"
)

In [46]:
# RAG pipeline: route the query by language -> retrieve from the matching
# store -> join results -> build the prompt -> generate the answer.
rag_pipeline = Pipeline()

# Detects the language of the incoming text and routes it to the output of
# that language; English, French and Spanish are supported.
rag_pipeline.add_component("router", TextLanguageRouter(languages=["en", "fr", "es"]))

# One BM25 retriever per language, each reading from that language's in-memory store.
rag_pipeline.add_component("en_retriever", InMemoryBM25Retriever(document_store=en_document_store))
rag_pipeline.add_component("fr_retriever", InMemoryBM25Retriever(document_store=fr_document_store))
rag_pipeline.add_component("es_retriever", InMemoryBM25Retriever(document_store=es_document_store))

# Merges the retrievers' document lists into a single list.
rag_pipeline.add_component("joiner", DocumentJoiner())

# Renders the prompt template with the query and the joined documents.
rag_pipeline.add_component("prompt_builder", PromptBuilder(template=prompt_template))

# Sends the prompt to the OpenAI API (default generator settings) to produce the reply.
rag_pipeline.add_component("llm", OpenAIGenerator())

In [47]:
# Wire the RAG graph with explicit socket names.
# Router output per language feeds the query of that language's retriever.
rag_pipeline.connect("router.en", "en_retriever.query")
rag_pipeline.connect("router.fr", "fr_retriever.query")
rag_pipeline.connect("router.es", "es_retriever.query")
# All retrievers feed the same joiner input.
rag_pipeline.connect("en_retriever.documents", "joiner.documents")
rag_pipeline.connect("fr_retriever.documents", "joiner.documents")
rag_pipeline.connect("es_retriever.documents", "joiner.documents")
# Joined documents fill the template; the rendered prompt goes to the LLM.
# The last connect() call stays the final expression so the notebook still
# displays the pipeline summary.
rag_pipeline.connect("joiner.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder.prompt", "llm.prompt")

<haystack.core.pipeline.pipeline.Pipeline object at 0x32055abc0>
🚅 Components
  - router: TextLanguageRouter
  - en_retriever: InMemoryBM25Retriever
  - fr_retriever: InMemoryBM25Retriever
  - es_retriever: InMemoryBM25Retriever
  - joiner: DocumentJoiner
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
🛤️ Connections
  - router.en -> en_retriever.query (str)
  - router.fr -> fr_retriever.query (str)
  - router.es -> es_retriever.query (str)
  - en_retriever.documents -> joiner.documents (List[Document])
  - fr_retriever.documents -> joiner.documents (List[Document])
  - es_retriever.documents -> joiner.documents (List[Document])
  - joiner.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [49]:
# Render a diagram of the RAG pipeline to rag_pipeline.png
rag_pipeline.draw("rag_pipeline.png")

In [50]:
# Try an English query. The same question goes to the router (for language
# detection and retrieval) and to the prompt builder (for the final prompt).
en_question = "Is this apartment conveniently located?"
result = rag_pipeline.run(
    {"router": {"text": en_question}, "prompt_builder": {"query": en_question}}
)
# Show the first generated reply.
print(result["llm"]["replies"][0])

Yes, the apartment is conveniently located.


In [51]:
# Try a Spanish query. The same question goes to the router (for language
# detection and retrieval) and to the prompt builder (for the final prompt).
es_question = "¿El desayuno es genial?"
result = rag_pipeline.run(
    {"router": {"text": es_question}, "prompt_builder": {"query": es_question}}
)
# Show the first generated reply.
print(result["llm"]["replies"][0])

Sí, el desayuno es estupendo.


In [52]:
# Try a Chinese query.
# NOTE(review): "zh" is not one of the languages configured on the router
# (en/fr/es), so presumably no retriever receives this query and the prompt
# contains no reviews - the reply may not be grounded in any document. Confirm
# whether this cell is meant to demonstrate that case.
zh_question = "這間公寓位置方便嗎？"
result = rag_pipeline.run(
    {"router": {"text": zh_question}, "prompt_builder": {"query": zh_question}}
)
# Show the first generated reply.
print(result["llm"]["replies"][0])

是的，這間公寓位置非常方便。
