In [16]:
from haystack.dataclasses import Document
import pandas as pd

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter

from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import GPTGenerator

from dotenv import load_dotenv
import os


In [17]:
# Load variables from the local .env file into the process environment,
# then read the OpenAI key from it (None if the variable is not set).
load_dotenv(".env")
openai_key = os.environ.get("OPENAI_KEY")

In [8]:
# Source https://www.kaggle.com/datasets/carrie1/ecommerce-data?resource=download
# Read the raw export and keep only the rows that carry a CustomerID.
df = (
    pd.read_csv("data.csv", encoding="latin1")
    .dropna(subset=["CustomerID"])
)

In [None]:
# One dict per row, keyed by column name.
df_dict = df.to_dict(orient="records")

In [9]:
# Columns copied verbatim from each row into the document metadata.
meta_fields = (
    "InvoiceNo",
    "StockCode",
    "Description",
    "Quantity",
    "InvoiceDate",
    "UnitPrice",
    "CustomerID",
    "Country",
)

# Create a list of Haystack documents: the product description is the
# document content, and the listed columns become its metadata.
haystack_documents = [
    Document(
        content=record["Description"],
        meta={field: record[field] for field in meta_fields},
    )
    for record in df_dict
]

In [10]:
# Indexing pipeline: an embedder component followed by a writer that stores
# the documents in an in-memory document store.
document_store = InMemoryDocumentStore()
indexing_pipeline = Pipeline()

embedder = SentenceTransformersDocumentEmbedder()
indexing_pipeline.add_component("embedder", embedder)

writer = DocumentWriter(document_store=document_store)
indexing_pipeline.add_component("writer", writer)

# Feed the embedder's output into the writer.
indexing_pipeline.connect("embedder", "writer")

In [12]:
# Run the indexing pipeline on the first 100 documents only.
indexing_pipeline.run(
    data={
        "embedder": {"documents": haystack_documents[:100]},
    }
)


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.00s/it]


{'writer': {'documents_written': 100}}

In [18]:
# Sanity check: show the first document returned by the store.
stored_documents = document_store.filter_documents()
stored_documents[0]

Document(id=7dc70d50b246194a881dfc297dff3064bea3ecafdfc964c45b99336fd75f57ba, content: 'WHITE HANGING HEART T-LIGHT HOLDER', meta: {'InvoiceNo': '536365', 'StockCode': '85123A', 'Description': 'WHITE HANGING HEART T-LIGHT HOLDER', 'Quantity': 6, 'InvoiceDate': '12/1/2010 8:26', 'UnitPrice': 2.55, 'CustomerID': 17850.0, 'Country': 'United Kingdom'}, embedding: vector of size 768)

In [19]:


######## Complete this section #############
# Jinja2 template rendered by PromptBuilder. It expects two variables:
# `documents` (the retrieved documents) and `query` (the user's question).
prompt_template = """
You are a helpful assistant who helps customers and employees with their questions about purchases.
You use the information provided in the documents to answer the questions.
You can answer the following types of questions:
Questions regarding the order: you require the invoice number.
Questions regarding the product: you require the stock code.
If the documents do not contain the answer to the question, say that ‘Answer is unknown.’
Context:
{% for doc in documents %}
    Document: {{ doc.content }} 
    Invoice Number: {{ doc.meta['InvoiceNo'] }} 
    Stock Code: {{doc.meta['StockCode']}}
    Quantity purchased: {{doc.meta['Quantity']}}
    Date of purchase: {{doc.meta['InvoiceDate']}}
    Price per item: {{doc.meta['UnitPrice']}} \n
{% endfor %};
Question: {{query}}
\n Answer:
"""
prompt_builder = PromptBuilder(prompt_template)
############################################
# Query-side components: a text embedder for the question, a retriever that
# returns the top 2 documents from the in-memory store, and the GPT generator.
query_embedder = SentenceTransformersTextEmbedder()
retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=2)
llm = GPTGenerator(api_key=openai_key)

# Assemble the query pipeline that the final cell runs. It is built in the
# same cell that creates the components, so re-running this cell always
# produces fresh components and a fresh pipeline. The component names
# "query_embedder", "prompt_builder" and "llm" must match the keys used when
# calling `pipeline.run(...)` and reading its result.
pipeline = Pipeline()
pipeline.add_component("query_embedder", query_embedder)
pipeline.add_component("retriever", retriever)
pipeline.add_component("prompt_builder", prompt_builder)
pipeline.add_component("llm", llm)

# question embedding -> retriever -> prompt (as `documents`) -> generator
pipeline.connect("query_embedder.embedding", "retriever.query_embedding")
pipeline.connect("retriever.documents", "prompt_builder.documents")
pipeline.connect("prompt_builder", "llm")

  instance = super().__call__(*args, **kwargs)


In [None]:
# Send the same question to both the embedder (for retrieval) and the
# prompt builder (for the rendered prompt), then print the first reply.
query = "How do I use the openai embedder?"
pipeline_inputs = {
    "query_embedder": {"text": query},
    "prompt_builder": {"query": query},
}
result = pipeline.run(data=pipeline_inputs)
first_reply = result["llm"]["replies"][0]
print(first_reply)