In [1]:
import tqdm as notebook_tqdm
from haystack.utils import Secret

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) and read the Pinecone key.
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail early with a readable message when the key is absent. The previous
# `os.environ[...] = PINECONE_API_KEY` raised an opaque TypeError for a missing
# key, and was a no-op otherwise because os.getenv() reads from os.environ.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Define it in the environment or in a .env file "
        "before running this notebook."
    )

In [3]:
# Confirm that the key was loaded WITHOUT echoing the secret into the saved notebook.
# The key that this cell used to display should be treated as compromised and rotated.
"PINECONE_API_KEY loaded" if PINECONE_API_KEY else "PINECONE_API_KEY missing"

'PINECONE_API_KEY loaded'

In [5]:
from haystack import Document
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
# Handle to the Pinecone-backed document store used by the indexing and query pipelines.
# NOTE(review): dimension=768 presumably matches the embedding size of the default
# SentenceTransformers embedders used later in this notebook; confirm if the model changes.
document_store = PineconeDocumentStore(
    dimension=768,
    index="default",
    namespace="default",
    # environment="gcp-starter",
)

In [6]:
# Display the store's repr to confirm the PineconeDocumentStore object was created.
document_store

<haystack_integrations.document_stores.pinecone.document_store.PineconeDocumentStore at 0x7852c88562e0>

In [7]:
from haystack.components.converters import PyPDFToDocument
from pathlib import Path # type: ignore
# Convert the source PDF into Haystack Document objects (stand-alone check before
# the same conversion is wired into the indexing pipeline).
pdf_path = Path("/home/sourabh/END_TO_END_RAG_APPLICATION_USING_HEYSTACK/data/Retrieval-Augmented_Generation-for-knowledg-graph.pdf")
converter = PyPDFToDocument()
docs = converter.run(sources=[pdf_path])

In [8]:
# Inspect the converter result; its 'documents' entry holds the converted Document objects.
docs

{'documents': [Document(id=2f4a29ed8bf31ff58961f77b64072ca5d257cb9dbfeaacd872a01dfbb746070e, content: 'Retrieval-Augmented Generation for
  Knowledge-Intensive NLP Tasks
  Patrick Lewis†‡, Ethan Perez⋆,
  Alek...', meta: {'file_path': '/home/sourabh/END_TO_END_RAG_APPLICATION_USING_HEYSTACK/data/Retrieval-Augmented_Generation-for-knowledg-graph.pdf'})]}

In [9]:
# Print the extracted text of the first converted document.
print(docs['documents'][0].content)

Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research;‡University College London;⋆New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-
stream NLP tasks. However, their ability to access and precisely manipulate knowl-
edge is still limited, and hence on knowledge-intensive tasks, their performance
lags behind task-speciﬁc architectures. Additionally, providing provenance for their
decisions and updating their world knowledge remain open research problems. Pre-
trained models with a differentiable access mechanism to explicit non-parametric
memory have so far been only investigated for extractive downstream tas

In [10]:
# Print the first document's embedding attribute (the recorded output is None at this point).
print(docs['documents'][0].embedding)

None


In [11]:
# Disabled example: the code is wrapped in a bare string literal, so it is never
# executed and the notebook only echoes the string back as the cell output.
# NOTE(review): the sample embeddings have 5 values while the store in this notebook is
# created with dimension=768 -- presumably they would not fit that index; confirm
# before re-enabling.
'''document_store.write_documents([
    Document(content="This is first", embedding=[0.0]*5), 
    Document(content="This is second",embedding=[0.1, 0.2, 0.3, 0.4, 0.5])
    ])'''


'document_store.write_documents([\n    Document(content="This is first", embedding=[0.0]*5), \n    Document(content="This is second",embedding=[0.1, 0.2, 0.3, 0.4, 0.5])\n    ])'

In [12]:
from haystack import Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder


In [13]:
# Instantiate the embedder with default arguments only to view its repr
# (declared inputs/outputs); the instance is not assigned to a name.
SentenceTransformersDocumentEmbedder()


<haystack.components.embedders.sentence_transformers_document_embedder.SentenceTransformersDocumentEmbedder object at 0x7852c80d5bb0>
Inputs:
  - documents: List[Document]
Outputs:
  - documents: List[Document]

In [14]:
# Empty pipeline that will hold the indexing components (converter, splitter, embedder, writer).
indexing = Pipeline()


In [15]:
# Assemble the indexing pipeline: PDF -> sentence-based chunks -> embeddings -> document store.
indexing_stages = [
    ("converter", PyPDFToDocument()),
    ("splitter", DocumentSplitter(split_by="sentence", split_length=2)),
    ("embedder", SentenceTransformersDocumentEmbedder()),
    ("writer", DocumentWriter(document_store)),
]
for stage_name, stage in indexing_stages:
    indexing.add_component(stage_name, stage)

# Feed each stage's output into the next one. The final connect() stays the last
# expression of the cell so the pipeline summary is still displayed as output.
indexing.connect("converter", "splitter")
indexing.connect("splitter", "embedder")
indexing.connect("embedder", "writer")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7852b2a80f10>
🚅 Components
  - converter: PyPDFToDocument
  - splitter: DocumentSplitter
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - converter.documents -> splitter.documents (List[Document])
  - splitter.documents -> embedder.documents (List[Document])
  - embedder.documents -> writer.documents (List[Document])

In [16]:
# Run the indexing pipeline on the PDF; the cell output reports how many documents were written.
pdf_source = "/home/sourabh/END_TO_END_RAG_APPLICATION_USING_HEYSTACK/data/Retrieval-Augmented_Generation-for-knowledg-graph.pdf"
indexing.run({"converter": {"sources": [pdf_source]}})

Batches: 100%|██████████| 17/17 [00:35<00:00,  2.11s/it]
Upserted vectors: 100%|██████████| 528/528 [00:18<00:00, 28.54it/s]


{'writer': {'documents_written': 528}}

In [17]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.builders import PromptBuilder
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
from haystack.components.generators import HuggingFaceTGIGenerator

In [42]:
# Disabled template for re-creating the document store: it is wrapped in a bare string
# literal, so nothing here executes and the notebook only echoes the string.
# The index and environment values are unfilled placeholders.
'''import os
from dotenv import load_dotenv
load_dotenv()
#from haystack import Document

# Make sure you have the PINECONE_API_KEY environment variable set
document_store = PineconeDocumentStore(metric="cosine", dimension=768, index="YOUR_INDEX_NAME", environment = "YOUR_ENVIRONMENT")
'''

'import os\nfrom dotenv import load_dotenv\nload_dotenv()\n#from haystack import Document\n\n# Make sure you have the PINECONE_API_KEY environment variable set\ndocument_store = PineconeDocumentStore(metric="cosine", dimension=768, index="YOUR_INDEX_NAME", environment = "YOUR_ENVIRONMENT")\n'

In [43]:
# Jinja-style prompt template for the generator. It expects two variables:
#   query     - the user's question
#   documents - an iterable of objects exposing `.content`, rendered one per loop pass
# The model is told to answer only from the listed documents and to reply
# 'I don't know' otherwise.
prompt_template = """Answer the following query based on the provided context. If the context does
                     not include an answer, reply with 'I don't know'.\n
                     Query: {{query}}
                     Documents:
                     {% for doc in documents %}
                        {{ doc.content }}
                     {% endfor %}
                     Answer: 
                  """

In [44]:
# Empty pipeline that will hold the query-time components (text embedder, retriever, prompt builder, llm).
query_pipeline = Pipeline()

In [45]:
# Query-side components: embed the question, retrieve matching chunks from the
# Pinecone document store, and render them into the prompt.
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
query_pipeline.add_component("retriever", PineconeEmbeddingRetriever(document_store=document_store))
query_pipeline.add_component("prompt_builder", PromptBuilder(template=prompt_template))

# Read the Hugging Face token from the HF_API_TOKEN environment variable instead of
# embedding it in the notebook. The token that was previously hardcoded here should be
# treated as leaked and revoked.
# NOTE(review): the "Model ... not found on HuggingFace Hub" error recorded for this
# cell may have been caused by the token being invalid or lacking access to the model
# rather than by the model id itself -- confirm with a valid token.
query_pipeline.add_component(
    "llm",
    HuggingFaceTGIGenerator(
        model="mistralai/Mistral-7B-v0.1",
        token=Secret.from_env_var("HF_API_TOKEN"),
    ),
)

ValueError: Model mistralai/Mistral-7B-v0.1 not found on HuggingFace Hub. Please provide a valid HuggingFace model_id.

In [46]:
# Wire the query pipeline: question embedding -> retriever -> prompt builder -> llm.
# The first two links name their sockets explicitly; the last one relies on the
# default sockets of "prompt_builder" and "llm".
explicit_links = (
    ("text_embedder.embedding", "retriever.query_embedding"),
    ("retriever.documents", "prompt_builder.documents"),
)
for sender, receiver in explicit_links:
    query_pipeline.connect(sender, receiver)
query_pipeline.connect("prompt_builder", "llm")

ValueError: Component named llm not found in the pipeline.

In [47]:
# Question to ask; it is passed to both the text embedder and the prompt builder when the query pipeline runs.
query = "What is RAG-Token?"

In [36]:
# Run the query pipeline: the same question is embedded for retrieval and
# substituted into the prompt template.
pipeline_inputs = {
    "text_embedder": {"text": query},
    "prompt_builder": {"query": query},
}
results = query_pipeline.run(pipeline_inputs)


ValueError: Missing input for component text1_embedder: text

In [39]:
# Display the full output returned by query_pipeline.run(); requires the run cell to have succeeded.
results

NameError: name 'results' is not defined

In [41]:
# Print the first reply produced by the "llm" component of the query pipeline.
print(results['llm']['replies'][0])

NameError: name 'results' is not defined