# 預處理

下載所有文件

In [1]:
# 導入庫
import gdown
# 下載網址
url = "https://drive.google.com/drive/folders/1n9yqq5Gl_HWfND5bTlrCwAOycMDt5EMj"
# 儲存的資料夾
output_dir = "recipe_files"
# 以靜默方式下載到指定資料夾中
gdown.download_folder(
    url,
    quiet=True,
    output=output_dir
)

['recipe_files/vegan_flan_recipe.md',
 'recipe_files/vegan_keto_eggplant_recipe.pdf',
 'recipe_files/vegan_sunflower_hemp_cheese_recipe.txt']

In [2]:
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()
file_type_router = FileTypeRouter(
    mime_types=[
        "text/plain",
        "application/pdf",
        "text/markdown"
    ]
)
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()
document_joiner = DocumentJoiner()

  from .autonotebook import tqdm as notebook_tqdm


數據清洗

In [3]:
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(
    split_by="word",
    split_length=150,
    split_overlap=50
)

In [4]:
document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)
document_writer = DocumentWriter(document_store)

In [5]:
preprocessing_pipeline = Pipeline()
preprocessing_pipeline.add_component(
    instance=file_type_router, name="file_type_router"
)
preprocessing_pipeline.add_component(
    instance=text_file_converter, name="text_file_converter"
)
preprocessing_pipeline.add_component(
    instance=markdown_converter, name="markdown_converter"
)
preprocessing_pipeline.add_component(
    instance=pdf_converter, name="pypdf_converter"
)
preprocessing_pipeline.add_component(
    instance=document_joiner, name="document_joiner"
)
preprocessing_pipeline.add_component(
    instance=document_cleaner, name="document_cleaner"
)
preprocessing_pipeline.add_component(
    instance=document_splitter, name="document_splitter"
)
preprocessing_pipeline.add_component(
    instance=document_embedder, name="document_embedder"
)
preprocessing_pipeline.add_component(
    instance=document_writer, name="document_writer"
)

In [6]:
preprocessing_pipeline.connect(
    "file_type_router.text/plain", "text_file_converter.sources"
)
preprocessing_pipeline.connect(
    "file_type_router.application/pdf", "pypdf_converter.sources"
)
preprocessing_pipeline.connect(
    "file_type_router.text/markdown", "markdown_converter.sources"
)
preprocessing_pipeline.connect(
    "text_file_converter", "document_joiner"
)
preprocessing_pipeline.connect(
    "pypdf_converter", "document_joiner"
)
preprocessing_pipeline.connect(
    "markdown_converter", "document_joiner"
)
preprocessing_pipeline.connect(
    "document_joiner", "document_cleaner"
)
preprocessing_pipeline.connect(
    "document_cleaner", "document_splitter"
)
preprocessing_pipeline.connect(
    "document_splitter", "document_embedder"
)
preprocessing_pipeline.connect(
    "document_embedder", "document_writer"
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x30b4d7910>
🚅 Components
  - file_type_router: FileTypeRouter
  - text_file_converter: TextFileToDocument
  - markdown_converter: MarkdownToDocument
  - pypdf_converter: PyPDFToDocument
  - document_joiner: DocumentJoiner
  - document_cleaner: DocumentCleaner
  - document_splitter: DocumentSplitter
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - file_type_router.text/plain -> text_file_converter.sources (List[Path])
  - file_type_router.application/pdf -> pypdf_converter.sources (List[Path])
  - file_type_router.text/markdown -> markdown_converter.sources (List[Path])
  - text_file_converter.documents -> document_joiner.documents (List[Document])
  - markdown_converter.documents -> document_joiner.documents (List[Document])
  - pypdf_converter.documents -> document_joiner.documents (List[Document])
  - document_joiner.documents -> document_cleaner.documents (List[Docu

### 以上完成預處理程序

## 以下是進行 RAG 測試

In [7]:
from pathlib import Path

# 測試文件路徑是否存在
file_path = Path("recipe_files/recipe.txt")
if file_path.exists():
    print(f"正在處理文件: {file_path}")

    try:
        # 設置日誌級別以查看處理過程
        import logging
        logging.basicConfig(level=logging.INFO)
        
        preprocessing_pipeline.run(
            {"file_type_router": {"sources": [file_path]}}
        )
    except Exception as e:
        print(f"處理過程中出現錯誤: {e}")
else:
    print(f"文件 {file_path} 不存在。")


INFO:haystack.core.pipeline.base:Warming up component document_embedder...
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


正在處理文件: recipe_files/recipe.txt


INFO:haystack.core.pipeline.pipeline:Running component file_type_router
INFO:haystack.core.pipeline.pipeline:Running component text_file_converter
INFO:haystack.core.pipeline.pipeline:Running component document_joiner
INFO:haystack.components.joiners.document_joiner:Some of the Documents DocumentJoiner got have score=None. It was configured to sort Documents by score, so those with score=None were sorted as if they had a score of -infinity.
INFO:haystack.core.pipeline.pipeline:Running component document_cleaner
INFO:haystack.core.pipeline.pipeline:Running component document_splitter
INFO:haystack.core.pipeline.pipeline:Running component document_embedder
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
INFO:haystack.core.pipeline.pipeline:Running component document_writer


In [8]:
import os
from getpass import getpass
from dotenv import load_dotenv

load_dotenv()

os.environ["HF_API_TOKEN"] = os.getenv("HF_API_TOKEN")

if "HF_API_TOKEN" not in os.environ:
    print("Hugging Face token not found in environment variables.")
    os.environ["HF_API_TOKEN"] = getpass("Enter Hugging Face token:")
else:
    print("Hugging Face token found in environment variables.")

Hugging Face token found in environment variables.


In [9]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import HuggingFaceAPIGenerator

template = """
根據給定的上下文回答問題。

上下文:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

問題: {{ question }}
回答:
"""
pipe = Pipeline()
# 添加
pipe.add_component(
    "embedder",
    SentenceTransformersTextEmbedder(
        model="sentence-transformers/all-MiniLM-L6-v2"
    )
)
# 添加
pipe.add_component(
    "retriever",
    InMemoryEmbeddingRetriever(document_store=document_store)
)
# 添加
pipe.add_component(
    "prompt_builder", PromptBuilder(template=template)
)
# 添加
pipe.add_component(
    "llm",
    HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}),
)
# 連線
pipe.connect("embedder.embedding", "retriever.query_embedding")
pipe.connect("retriever", "prompt_builder.documents")
pipe.connect("prompt_builder", "llm")

<haystack.core.pipeline.pipeline.Pipeline object at 0x105e07d90>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: HuggingFaceAPIGenerator
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [10]:
question = (
    "製作純素酮茄子烤寬麵條、純素柿子餡餅和純素大麻起司需要哪些原料？"
)
result = pipe.run(
    {
        "embedder": {"text": question},
        "prompt_builder": {"question": question},
        "llm": {"generation_kwargs": {"max_new_tokens": 350}},
    }
)
result

INFO:haystack.core.pipeline.base:Warming up component embedder...
INFO:haystack.core.pipeline.pipeline:Running component embedder
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
INFO:haystack.core.pipeline.pipeline:Running component retriever
INFO:haystack.core.pipeline.pipeline:Running component prompt_builder
INFO:haystack.core.pipeline.pipeline:Running component llm


{'llm': {'replies': [' \n\n要製作純素酮茄子烤寬麵條、純素柿子餡餅和純素大麻起司，你需要下列原料：\n\n1. 純素酮茄子烤寬麵條：\n   - 純素茄子\n   - 純素寬麵\n   - 純素酱油\n   - 純素盐\n   - 純素甜味子 (可選)\n\n2. 純素柿子餡餅：\n   - 純素柿子\n   - 純素麵粉\n   - 純素酱油\n   - 純素盐\n   - 純素甜味子 (可選)\n\n3. 純素大麻起司：\n   - 純素大麻\n   - 純素酱油\n   - 純素盐\n   - 純素水\n   - 純素甜味子 (可選)\n\n注意： 這些食識是純素的，所以請確保所有的原料都是純素的'],
  'meta': [{'model': 'HuggingFaceH4/zephyr-7b-beta',
    'finish_reason': 'length',
    'usage': {'completion_tokens': 350}}]}}

逐行輸出生成的文本，並去除多餘的空白字元

In [11]:
# 取得生成的文本
response = result["llm"]["replies"][0]

# 格式化文本並輸出
print("生成的原料列表：\n")
lines = response.split("\n")
for line in lines:
    print(line.strip())

生成的原料列表：



要製作純素酮茄子烤寬麵條、純素柿子餡餅和純素大麻起司，你需要下列原料：

1. 純素酮茄子烤寬麵條：
- 純素茄子
- 純素寬麵
- 純素酱油
- 純素盐
- 純素甜味子 (可選)

2. 純素柿子餡餅：
- 純素柿子
- 純素麵粉
- 純素酱油
- 純素盐
- 純素甜味子 (可選)

3. 純素大麻起司：
- 純素大麻
- 純素酱油
- 純素盐
- 純素水
- 純素甜味子 (可選)

注意： 這些食識是純素的，所以請確保所有的原料都是純素的
