# 30_file_type_preprocessing_index_pipeline
In Memory 

https://haystack.deepset.ai/tutorials/30_file_type_preprocessing_index_pipeline

Working

In [1]:
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore

In [2]:
# !pip install markdown-it-py mdit_plain pypdf

In [3]:
document_store = InMemoryDocumentStore()
file_type_router = FileTypeRouter(mime_types=["text/plain", "application/pdf", "text/markdown"])
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()
document_joiner = DocumentJoiner()

In [4]:
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_by="word", split_length=150, split_overlap=50)

In [5]:
document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
document_writer = DocumentWriter(document_store)

In [6]:
preprocessing_pipeline = Pipeline()
preprocessing_pipeline.add_component(instance=file_type_router, name="file_type_router")
preprocessing_pipeline.add_component(instance=text_file_converter, name="text_file_converter")
preprocessing_pipeline.add_component(instance=markdown_converter, name="markdown_converter")
preprocessing_pipeline.add_component(instance=pdf_converter, name="pypdf_converter")
preprocessing_pipeline.add_component(instance=document_joiner, name="document_joiner")
preprocessing_pipeline.add_component(instance=document_cleaner, name="document_cleaner")
preprocessing_pipeline.add_component(instance=document_splitter, name="document_splitter")
preprocessing_pipeline.add_component(instance=document_embedder, name="document_embedder")
preprocessing_pipeline.add_component(instance=document_writer, name="document_writer")

In [7]:
preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
preprocessing_pipeline.connect("file_type_router.application/pdf", "pypdf_converter.sources")
preprocessing_pipeline.connect("file_type_router.text/markdown", "markdown_converter.sources")
preprocessing_pipeline.connect("text_file_converter", "document_joiner")
preprocessing_pipeline.connect("pypdf_converter", "document_joiner")
preprocessing_pipeline.connect("markdown_converter", "document_joiner")
preprocessing_pipeline.connect("document_joiner", "document_cleaner")
preprocessing_pipeline.connect("document_cleaner", "document_splitter")
preprocessing_pipeline.connect("document_splitter", "document_embedder")
preprocessing_pipeline.connect("document_embedder", "document_writer")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f434c367d90>
🚅 Components
  - file_type_router: FileTypeRouter
  - text_file_converter: TextFileToDocument
  - markdown_converter: MarkdownToDocument
  - pypdf_converter: PyPDFToDocument
  - document_joiner: DocumentJoiner
  - document_cleaner: DocumentCleaner
  - document_splitter: DocumentSplitter
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - file_type_router.text/plain -> text_file_converter.sources (List[Path])
  - file_type_router.application/pdf -> pypdf_converter.sources (List[Path])
  - file_type_router.text/markdown -> markdown_converter.sources (List[Path])
  - text_file_converter.documents -> document_joiner.documents (List[Document])
  - markdown_converter.documents -> document_joiner.documents (List[Document])
  - pypdf_converter.documents -> document_joiner.documents (List[Document])
  - document_joiner.documents -> document_cleaner.documents (List[D

In [8]:
from pathlib import Path
data_dir = "/opt/data/samples"
preprocessing_pipeline.run({"file_type_router": {"sources": list(Path(data_dir).glob("**/*"))}})

Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Could not read /opt/data/jax/Duval Charter schools/17601_2018_1.pdf and convert it to Document, skipping. cryptography>=3.1 is required for AES algorithm
Could not read /opt/data/jax/Duval Charter schools/COMPREHENSIVE_ANNUAL_FINANCIAL_REPORT_CAFR_2017-18.pdf and convert it to Document, skipping. cryptography>=3.1 is required for AES algorithm
Ignoring wrong pointing object 45 0 (offset 0)
Could not read /opt/data/jax/Duval Charter schools/19032_2018_1.pdf and convert it to Document, skipping. cryptography>=3.1 is required for AES algorithm


Batches:   0%|          | 0/205 [00:00<?, ?it/s]

{'file_type_router': {'unclassified': [PosixPath('/opt/data/jax/Coggins'),
   PosixPath('/opt/data/jax/Beaches dem club'),
   PosixPath('/opt/data/jax/Duval Charter schools'),
   PosixPath('/opt/data/jax/Coggins/Baynard.docx'),
   PosixPath('/opt/data/jax/Coggins/Blue Ecosystem2.xmind'),
   PosixPath('/opt/data/jax/Coggins/LohBaynard.docx'),
   PosixPath('/opt/data/jax/Coggins/Albert Loh ChungPing.docx'),
   PosixPath('/opt/data/jax/Coggins/ACSDT5Y2020.B27010_2022-08-09T082301.zip'),
   PosixPath('/opt/data/jax/Beaches dem club/9bc497_a28e8e03f6ae443080e2f1484d244f6e.docx'),
   PosixPath('/opt/data/jax/Beaches dem club/9bc497_8b744d71e02c41008335e247a94d30fc.docx'),
   PosixPath('/opt/data/jax/Beaches dem club/FL Dems.docx'),
   PosixPath('/opt/data/jax/Beaches dem club/BDC content.docx'),
   PosixPath('/opt/data/jax/Beaches dem club/9bc497_4dfd6216b2514148bac61a539f492da2.docx'),
   PosixPath('/opt/data/jax/Beaches dem club/CAFR'),
   PosixPath('/opt/data/jax/Beaches dem club/9bc497_8

In [10]:
import os
from getpass import getpass

if "HF_API_TOKEN" not in os.environ:
    os.environ["HF_API_TOKEN"] = getpass("Enter Hugging Face token:")

In [11]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import HuggingFaceTGIGenerator

template = """
Answer the questions based on the given context.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ question }}
Answer:
"""
pipe = Pipeline()
pipe.add_component("embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"))
pipe.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
pipe.add_component("prompt_builder", PromptBuilder(template=template))
pipe.add_component("llm", HuggingFaceTGIGenerator("mistralai/Mistral-7B-Instruct-v0.1"))

pipe.connect("embedder.embedding", "retriever.query_embedding")
pipe.connect("retriever", "prompt_builder.documents")
pipe.connect("prompt_builder", "llm")


<haystack.core.pipeline.pipeline.Pipeline object at 0x7f434c3a8d90>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: HuggingFaceTGIGenerator
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [12]:
question = (
    # "What topics are discussed in jacksonville politics?"
    "What are some Action Based Groups in jacksonville?"
)

pipe.run(
    {
        "embedder": {"text": question},
        "prompt_builder": {"question": question},
        "llm": {"generation_kwargs": {"max_new_tokens": 350}},
    }
)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'llm': {'replies': [' 1. Office of Sports and Entertainment\n2. Military Affairs, Veterans & Disabled Svcs\n3. Downtown Investment Authority\n4. Jacksonville Housing & Finance Auth.\n5. Office of Ethics, Compliance & Oversight\n6. Central Operations\n7. Environmental and Compliance\n8. Recreation and Community Services\n9. Information Technolog y\n10. Human Rights Commission\n11. Jacksonville Economic Dev. Commission\n12. Housing and Neighborhoods\n13. Administration and Finance\n14. Parks, Rec., Enter., and Conservation\n15. Community Services\n16. Procurement and Supply\n17. Agriculture\n18. Judicial\n19. Commerce\n20. Transportation\n21. Aviation Port\n22. Business-Type\n23. Commission\n24. Units\n25. JEA Authority\n26. Authority\n27. ACTIVITIES\n28. MAJOR JACKSONVILLE JACKSONVILLE JACKSONVILLE AND DEVELOPMENT COMPONENT TRANSPORTATION AVIATION PORT BUSINESS-TYPE COMMISSION UNITS JEA AUTHORITY AUTHORITY ACTIVITIES\n29. $ - $ - $ - $ - $ - $ - $ (63,289)\n30. - - - - - (502) (6,487) 