In [2]:
from haystack import Pipeline
from haystack.components.preprocessors import TextCleaner, DocumentCleaner, DocumentSplitter
from haystack.components.converters import TikaDocumentConverter 

In [3]:
# pipeline = Pipeline()
# pipeline.add_component("cleaner", TextCleaner(remove_punctuation=True, convert_to_lowercase=True))
# pipeline.run({"cleaner":{"texts":["Hello World$"]}})
pipeline = Pipeline()
pipeline.add_component("converter", TikaDocumentConverter())
pipeline.add_component("cleaner", DocumentCleaner(remove_regex='[^A-Za-z0-9 !.:/]+',
   remove_empty_lines=True, remove_repeated_substrings=True))
pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=5))
pipeline.connect("converter", "cleaner")
pipeline.connect("cleaner", "splitter")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f7b3d012790>
🚅 Components
  - converter: TikaDocumentConverter
  - cleaner: DocumentCleaner
  - splitter: DocumentSplitter
🛤️ Connections
  - converter.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> splitter.documents (List[Document])

In [4]:
results = pipeline.run({"converter": {"sources": ["slack_policy.html"]}})

In [5]:
[document.content for document in results["splitter"]["documents"]]

['Privacy Policy  Legal  Slack Skip to main content Features COLLABORATION Channels Organize teams and work Slack Connect Work with external partners Messaging Chat with your team Huddles Meet with audio and video Clips Record and share updates AUTOMATION Workflow Builder Automate everyday tasks Apps  Integrations Bring your tools to Slack KNOWLEDGE Search Seek shared knowledge Canvas Create rich flexible docs File Sharing Bring files to the flow of work Lists Organize track and manage projects INTELLIGENCE Slack AI Save time and work smarter with powerfully simple AI ENTERPRISE PLATFORM Security Protect data ensure compliance Enterprise Key Management Monitor and revoke access Slack Atlas Discover rich profiles and org charts Watch Demo Download Slack FEATURED Businesses of all sizes are working faster and smarter with Slack AILearn more What is Slack Slack vs. Email Accessibility Solutions BY DEPARTMENT Engineering IT Customer Service Sales Project Management Marketing Human Resource

In [6]:
from haystack.components.readers import ExtractiveReader
from huggingface_hub import notebook_login
from dotenv import load_dotenv
   
load_dotenv()
# notebook_login()

reader = ExtractiveReader(model="meta-llama/Meta-Llama-3-8B")
reader.warm_up()
""""""
# What user information and identifiers are collected by the service provider, such as their device and SIM identifiers, location, contact details etc.
user_data_results = reader.run(query="List out all user information collected by the service provider, including device IDs, SIM, IP Addresses, Location data and anything else. List all information recorded from user interactions, and how user generated content is used for purposes such as advertisement or for training Artifical Intelligence, Machine Learning Models, or Generative AI.", documents=results["splitter"]["documents"])

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   7%|7         | 367M/4.98G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
user_data_answers = [(answer.document.content) for answer in user_data_results["answers"] if answer.document]
user_data_answers