In [1]:
from haystack import Pipeline
from haystack.components.preprocessors import TextCleaner, DocumentCleaner, DocumentSplitter
from haystack.components.converters import TikaDocumentConverter 

In [16]:
# pipeline = Pipeline()
# pipeline.add_component("cleaner", TextCleaner(remove_punctuation=True, convert_to_lowercase=True))
# pipeline.run({"cleaner":{"texts":["Hello World$"]}})
pipeline = Pipeline()
pipeline.add_component("converter", TikaDocumentConverter())
pipeline.add_component("cleaner", DocumentCleaner(remove_regex='[^A-Za-z0-9 !.:/]+',
   remove_empty_lines=True, remove_repeated_substrings=True))
pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3, split_overlap=0))
pipeline.connect("converter", "cleaner")
pipeline.connect("cleaner", "splitter")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7fdec63fab80>
🚅 Components
  - converter: TikaDocumentConverter
  - cleaner: DocumentCleaner
  - splitter: DocumentSplitter
🛤️ Connections
  - converter.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> splitter.documents (List[Document])

In [17]:
results = pipeline.run({"converter": {"sources": ["tiktok_policy.html"]}})

In [18]:
[document.content for document in results["splitter"]["documents"]]

['Privacy Policy  TikTok U.S. Privacy Policy Last updated: March 28 2024This Privacy Policy applies to TikTok services the Platform which include TikTok apps websites software and related services accessed via any platform or device that link to this Privacy Policy.',
 ' The Platform is provided and controlled by TikTok Inc. TikTok we or us. We are committed to protecting and respecting your privacy.',
 ' This Privacy Policy explains how we collect use share and otherwise process the personal information of users and other individuals age 13 and over in connection with our Platform. For information about how we collect use share and otherwise process the personal information of users under age 13 Children please refer to our Childrens Privacy Policy. For information about how we collect use share and otherwise process consumer health data as defined under Washingtons My Health My Data Act and other similar state laws please refer to the Consumer Health Data Privacy Policy.',
 'Capitali

In [19]:
from haystack.components.readers import ExtractiveReader
from haystack.utils import Secret
from huggingface_hub import notebook_login
from dotenv import load_dotenv
   
load_dotenv()
# notebook_login()

reader = ExtractiveReader(model="deepset/bert-large-uncased-whole-word-masking-squad2")
reader.warm_up()
""""""
# What user information and identifiers are collected by the service provider, such as their device and SIM identifiers, location, contact details etc.
# prompt = """
# 	List out all user information collected by the service provider, including device IDs, SIM, IP
# 	Addresses, Location data and anything else. List all information recorded from user interactions,
#  	and how user generated content is used for purposes such as advertisement or for training Artifical
#   Intelligence, Machine Learning Models, or Generative AI.
# """

prompt = """
	You are a privacy policy analyzer. Your job is to evaluate the given set of documents and list
	any personally identifiable information that is collected by the third party service, which
	includes details about the user themselves (such as their name and address), or information
	about their device(s) (such as their geographic location, device ID, advertising ID, device
 	settings or configurations, network details such as WiFi SSIDs, Interet Protocol Addresses,
  	MAC Addresses) This detail can also include information collected automatically by the service
   provider about user interactions, such as image and audio information, any type of metadata,
   cookies, and tracking across other websites and applications.
"""
user_data_results = reader.run(query=prompt, documents=results["splitter"]["documents"])

Some weights of the model checkpoint at deepset/bert-large-uncased-whole-word-masking-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# user_data_answers = [(answer.document.content) for answer in user_data_results["answers"] if answer.document]
# user_data_answers
user_data_results

{'answers': [ExtractedAnswer(query='\n\tYou are a privacy policy analyzer. Your job is to evaluate the given set of documents and list\n\tany personally identifiable information that is collected by the third party service, which\n\tincludes details about the user themselves (such as their name and address), or information\n\tabout their device(s) (such as their geographic location, device ID, advertising ID, device\n \tsettings or configurations, network details such as WiFi SSIDs, Interet Protocol Addresses,\n  \tMAC Addresses) This detail can also include information collected automatically by the service\n   provider about user interactions, such as image and audio information, any type of metadata,\n   cookies, and tracking across other websites and applications.\n', score=0.5468508005142212, data='Privacy Policy  Legal  Slack', document=Document(id=ef86a622a45a94cfcba4be2292c9c4b7a9c3d6266b87c6b8748fa6d63f08d728, content: 'Privacy Policy  Legal  Slack Skip to main content Feature