# 30 Pinecone and Haystack with splitter
This notebook adapts the Haystack tutorial linked below to use a Pinecone vector database.

https://haystack.deepset.ai/tutorials/30_file_type_preprocessing_index_pipeline

**Status:** unverified — it has not yet been confirmed that this notebook works end to end.

In [1]:
import os
from pathlib import Path
from getpass import getpass

from haystack.document_stores.in_memory import InMemoryDocumentStore

In [2]:
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline
# from haystack.document_stores.in_memory import InMemoryDocumentStore

In [3]:
# %pip install markdown-it-py mdit_plain pypdf "cryptography>=3.1"  # cryptography is required to read AES-encrypted PDFs

In [4]:
from haystack import Document
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore

In [4]:
# Reuse a Hugging Face token that is already present in the environment and
# only prompt (without echoing the input) when it is missing, so re-running
# the notebook does not ask for the token again.
if "HF_API_TOKEN" not in os.environ:
    os.environ["HF_API_TOKEN"] = getpass("Enter Hugging Face API key:")

In [5]:
# Name of the environment variable that holds the Pinecone API key.
pak = "PINECONE_API_KEY"

# Prompt for the key (input is not echoed) only when the environment does not
# already provide it; otherwise just report that it was found.
if pak not in os.environ:
    os.environ["PINECONE_API_KEY"] = getpass("Enter PINECONE API key:")
else:
    print("Found PINECONE_API_KEY in os.environ, no need to prompt")

In [6]:
# Make sure you have the PINECONE_API_KEY environment variable set
document_store = PineconeDocumentStore(
		environment="gcp-starter",
		index="myidx",
		namespace="myns",
		dimension=5
)

document_store.write_documents([
    Document(content="This is first", embedding=[0.2]*5), 
    Document(content="This is second", embedding=[0.1, 0.2, 0.3, 0.4, 0.5])
    ])

Upserted vectors:   0%|          | 0/2 [00:00<?, ?it/s]

2

In [7]:
# Ask the store how many documents it currently reports and print that number.
stored_document_count = document_store.count_documents()
print(stored_document_count)

2


In [8]:
# NOTE: this rebinds `document_store` to a fresh in-memory store, so the cells
# below (writer, retriever) work against it rather than the Pinecone store
# created earlier in the notebook.
document_store = InMemoryDocumentStore()

# Router that dispatches sources by MIME type; one converter per routed type.
file_type_router = FileTypeRouter(
    mime_types=[
        "text/plain",
        "application/pdf",
        "text/markdown",
    ]
)
pdf_converter = PyPDFToDocument()
markdown_converter = MarkdownToDocument()
text_file_converter = TextFileToDocument()

# Joiner instance placed downstream of the three converters.
document_joiner = DocumentJoiner()

In [9]:
# Cleaner with its default settings.
document_cleaner = DocumentCleaner()

# Split by word: 150 words per chunk, with 50 words of overlap between chunks.
document_splitter = DocumentSplitter(
    split_by="word",
    split_length=150,
    split_overlap=50,
)

In [10]:
# Document embedder backed by the all-MiniLM-L6-v2 sentence-transformers model.
document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
)

# Writer targeting whatever `document_store` is bound to at this point.
document_writer = DocumentWriter(document_store=document_store)

In [11]:
# Build the indexing pipeline and register every component under the name that
# the connection step refers to. Registration order matches the listing below.
preprocessing_pipeline = Pipeline()

for component_name, component in (
    ("file_type_router", file_type_router),
    ("text_file_converter", text_file_converter),
    ("markdown_converter", markdown_converter),
    ("pypdf_converter", pdf_converter),
    ("document_joiner", document_joiner),
    ("document_cleaner", document_cleaner),
    ("document_splitter", document_splitter),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
):
    preprocessing_pipeline.add_component(instance=component, name=component_name)

In [12]:
# Wire the indexing pipeline: router -> per-type converter -> joiner ->
# cleaner -> splitter -> embedder -> writer. Pairs are (sender, receiver) and
# are connected in the order listed.
for sender, receiver in (
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pypdf_converter.sources"),
    ("file_type_router.text/markdown", "markdown_converter.sources"),
    ("text_file_converter", "document_joiner"),
    ("pypdf_converter", "document_joiner"),
    ("markdown_converter", "document_joiner"),
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    ("document_splitter", "document_embedder"),
):
    preprocessing_pipeline.connect(sender, receiver)

# The final connection stays as the cell's last expression so its return value
# is still what the notebook displays.
preprocessing_pipeline.connect("document_embedder", "document_writer")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f2f8efba920>
🚅 Components
  - file_type_router: FileTypeRouter
  - text_file_converter: TextFileToDocument
  - markdown_converter: MarkdownToDocument
  - pypdf_converter: PyPDFToDocument
  - document_joiner: DocumentJoiner
  - document_cleaner: DocumentCleaner
  - document_splitter: DocumentSplitter
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - file_type_router.text/plain -> text_file_converter.sources (List[Path])
  - file_type_router.application/pdf -> pypdf_converter.sources (List[Path])
  - file_type_router.text/markdown -> markdown_converter.sources (List[Path])
  - text_file_converter.documents -> document_joiner.documents (List[Document])
  - markdown_converter.documents -> document_joiner.documents (List[Document])
  - pypdf_converter.documents -> document_joiner.documents (List[Document])
  - document_joiner.documents -> document_cleaner.documents (List[D

In [14]:
from pathlib import Path

# Directory that is scanned recursively for input files.
# (An alternative corpus used during development lives at "/opt/data/text".)
data_dir = "/opt/data/samples"

# `glob("**/*")` yields directories as well as files, so keep only regular
# files before handing the paths to the router.
source_files = [path for path in Path(data_dir).glob("**/*") if path.is_file()]

# Run the indexing pipeline; left as the last expression so the run result is
# displayed as the cell output.
preprocessing_pipeline.run({"file_type_router": {"sources": source_files}})

Could not read /opt/data/samples/COMPREHENSIVE_ANNUAL_FINANCIAL_REPORT_CAFR_2017-18.pdf and convert it to Document, skipping. cryptography>=3.1 is required for AES algorithm


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 258}}

In [15]:
import os
from getpass import getpass

# Ask for the Hugging Face token (input is not echoed) only when the
# environment does not already provide one.
if os.environ.get("HF_API_TOKEN") is None:
    os.environ["HF_API_TOKEN"] = getpass("Enter Hugging Face token:")

In [17]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import HuggingFaceTGIGenerator, HuggingFaceLocalGenerator

# Jinja prompt template: the content of every retrieved document is listed as
# context, followed by the user's question.
template = """
Answer the questions based on the given context.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ question }}
Answer:
"""

# Locally run text2text generator used as the pipeline's "llm" component.
generator = HuggingFaceLocalGenerator(
    model="google/flan-t5-large",
    task="text2text-generation",
    generation_kwargs={"max_new_tokens": 100, "temperature": 0.9},
)

# Query pipeline: embed the question, retrieve documents from `document_store`,
# build the prompt, then generate an answer.
pipe = Pipeline()
pipe.add_component(
    name="embedder",
    # Same model name as the document embedder used for indexing.
    instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
pipe.add_component(
    name="retriever",
    instance=InMemoryEmbeddingRetriever(document_store=document_store),
)
pipe.add_component(name="prompt_builder", instance=PromptBuilder(template=template))
# pipe.add_component("llm", HuggingFaceTGIGenerator("mistralai/Mistral-7B-Instruct-v0.1"))
pipe.add_component(name="llm", instance=generator)

for sender, receiver in (
    ("embedder.embedding", "retriever.query_embedding"),
    ("retriever", "prompt_builder.documents"),
):
    pipe.connect(sender, receiver)

# The final connection stays as the cell's last expression so its return value
# is still what the notebook displays.
pipe.connect("prompt_builder", "llm")


<haystack.core.pipeline.pipeline.Pipeline object at 0x7f420731bc70>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: HuggingFaceLocalGenerator
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [23]:
# Question sent through the query pipeline. Alternatives tried previously:
#   "What topics are discussed in jacksonville politics?"
#   "What are major tax spending items in jacksonville?"
question = "What are major topics of concern for charter schools?"

# The same question feeds both the embedder (for retrieval) and the prompt
# builder; the generator's token budget is overridden for this run.
query_inputs = {
    "embedder": {"text": question},
    "prompt_builder": {"question": question},
    "llm": {"generation_kwargs": {"max_new_tokens": 350}},
}

foo = pipe.run(query_inputs)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
# Print the raw pipeline result returned by the query run.
print("Foo:", foo)

Foo: {'llm': {'replies': ['Lack of access to affordable facilities']}}
