In [None]:
from haystack.dataclasses import Document
import pandas as pd

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter

from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import GPTGenerator

from haystack.components.retrievers.in_memory import InMemoryBM25Retriever


from dotenv import load_dotenv
import os


In [None]:
# Pull variables from the local .env file into the process environment,
# then read the OpenAI key from it (None when OPENAI_KEY is not set).
load_dotenv(dotenv_path=".env")
openai_key = os.environ.get("OPENAI_KEY")

In [None]:
# Source https://www.kaggle.com/datasets/carrie1/ecommerce-data?resource=download
# Read the raw export, discard rows that have no CustomerID and lower-case
# every column name, all in one chain.
df = (
    pd.read_csv("data.csv", encoding='latin1')
    .dropna(subset=['CustomerID'])
    .rename(columns=str.lower)
)

# One plain dict per remaining row, keyed by the lower-cased column names
df_dict = df.to_dict(orient="records")

In [None]:
# Distinct values of the country column
df.loc[:, "country"].unique()

In [None]:
# (label shown in the document text, df_dict key) pairs, in the order they
# appear in the generated content string.
content_fields = (
    ("Name of item purchased", "description"),
    ("Quantity purchased", "quantity"),
    ("Price of item", "unitprice"),
    ("Date of purchase", "invoicedate"),
    ("Country of purchase", "country"),
    ("Customer ID", "customerid"),
    ("Invoice Number", "invoiceno"),
    ("Stock Code", "stockcode"),
)

# Row fields copied verbatim into each document's metadata.
meta_fields = (
    "invoiceno",
    "stockcode",
    "description",
    "quantity",
    "invoicedate",
    "unitprice",
    "customerid",
    "country",
)

haystack_documents = []

# Create a list of Haystack documents, one per row of df_dict
for i, record in enumerate(df_dict):
    # Build "Label: value;" segments joined by a single space. The previous
    # backslash-continued f-string leaked each line's indentation into the
    # content and, because of a trailing comma, produced a 1-tuple that then
    # had to be unpacked with [0].
    content_str = " ".join(
        f"{label}: {record[column]};" for label, column in content_fields
    )
    haystack_documents.append(Document(
        content=content_str,
        # Stable synthetic id: "ZOOA" followed by 1000000 + row position
        id=f"ZOOA{1000000 + i}",
        meta={column: record[column] for column in meta_fields},
    ))

In [None]:
# Show the text generated for the first document
first_document = haystack_documents[0]
first_document.content

In [None]:
# In-memory store configured to use the BM25Plus ranking variant,
# then filled with every document built so far.
document_store = InMemoryDocumentStore(bm25_algorithm="BM25Plus")
document_store.write_documents(haystack_documents)

In [None]:
# With no filters every stored document is returned; show the first one
stored_documents = document_store.filter_documents()
stored_documents[0]

In [None]:


######## Complete this section #############
# Jinja template rendered for every question. It expects two variables:
#   documents - the retrieved documents (their `content` and `meta` are read)
#   query     - the user's question
# The string is not raw, so the literal "\n" sequences below become extra
# blank lines in the rendered prompt.
prompt_template = """
You are an expert data analyst who helps customers and employees with their questions about purchases and products.
You use the information provided in the documents to answer the questions.
You can answer the following types of questions:
Questions regarding the order: please ask the user to give you the invoice number.
Questions regarding the product: please ask the user to give you the stock code.
Questions regarding purchases made on a given day: please ask the user to give you the date of purchase.
If you are asked to calculate the total price of a purchase, please ask the user to give you the invoice 
number and add the total price of the items in the purchase. Outline the item name, number of items and cost per unit in your response. 
If you are asked to calculate the total number of items for a purchase, please ask the user to give 
you the invoice number and add the total number of items in the purchase. Outline the item name and number of items in your response.
If the invoiceno starts with C and the total number of items is negative, it means the items were returned. You should indicate this clearly in your answer.
If the documents do not contain the answer to the question, say that ‘Answer is unknown.’
Context:
{% for doc in documents %}
    Purchase information: {{ doc.content }} 
    Invoice Number: {{ doc.meta['invoiceno'] }} 
    Stock Code: {{doc.meta['stockcode']}}
    Quantity purchased: {{doc.meta['quantity']}}
    Date of purchase: {{doc.meta['invoicedate']}}
    Price per item: {{doc.meta['unitprice']}} \n
{% endfor %}
Question: {{query}}
\n Answer:
"""
# Building blocks of the question-answering pipeline: the prompt renderer and
# a BM25 retriever reading from the in-memory document store.
prompt_builder = PromptBuilder(template=prompt_template)
retriever = InMemoryBM25Retriever(document_store=document_store)
############################################
# GPT-4 generator; temperature 0 is passed through generation_kwargs.
llm = GPTGenerator(
    api_key=openai_key,
    model='gpt-4',
    generation_kwargs={"temperature": 0},
)

# Register the three components under the names the later cells address.
prediction_pipeline = Pipeline()
prediction_pipeline.add_component(name="retriever", instance=retriever)
prediction_pipeline.add_component(name="prompt_builder", instance=prompt_builder)
prediction_pipeline.add_component(name="generator", instance=llm)

# Wiring: retrieved documents feed the prompt, the rendered prompt feeds the LLM.
prediction_pipeline.connect("retriever.documents", "prompt_builder.documents")
prediction_pipeline.connect("prompt_builder", "generator")


In [None]:
# Ask which items belong to invoice 536365; the same question text is sent
# to both the retriever and the prompt builder.
query = "Items for order with invoice number 536365"
pipeline_inputs = {
    "retriever": {"query": query},
    "prompt_builder": {"query": query},
}
result = prediction_pipeline.run(data=pipeline_inputs)
print(result['generator']['replies'][0])

In [None]:
# Ask for the item count of order C536825 and print the first reply.
query = "Total number of items purchased for order C536825"
pipeline_inputs = {
    component: {"query": query} for component in ("retriever", "prompt_builder")
}
result = prediction_pipeline.run(data=pipeline_inputs)
print(result['generator']['replies'][0])

In [None]:
# Ask for the total cost of order C536825 and print the first reply.
query = "Total cost for order C536825"
question_payload = {"query": query}
result = prediction_pipeline.run(
    data={
        "retriever": dict(question_payload),
        "prompt_builder": dict(question_payload),
    }
)
print(result['generator']['replies'][0])

In [None]:
# Summed quantity for each invoice number
df.groupby(by=["invoiceno"])["quantity"].agg("sum")

In [None]:
# Invoice numbers that have at least one negative-quantity row and contain "C"
negative_quantity_invoices = df.loc[df["quantity"] < 0, "invoiceno"].unique()
[invoice for invoice in negative_quantity_invoices if "C" in invoice]

In [None]:
# Total quantity across the rows of invoice 536365
df.loc[df["invoiceno"] == "536365", "quantity"].sum()

In [None]:
# Total value of invoice 536365: quantity times unit price, summed over its rows
order_lines = df[df["invoiceno"] == "536365"]
(order_lines["quantity"] * order_lines["unitprice"]).sum()

In [None]:
# Distinct invoice dates on which stock code 84406B appears
df.loc[df["stockcode"] == "84406B", "invoicedate"].unique()

In [None]:

# Retrieval-only pipeline: a single BM25 retriever over the same document store
bm25_retriever = InMemoryBM25Retriever(document_store=document_store)
pipeline = Pipeline()
pipeline.add_component(name="retriever", instance=bm25_retriever)

In [None]:
# Run the retriever with a metadata filter so that only documents whose
# invoiceno equals "536365" are considered.
query = "Names of items purchased"
invoice_filter = {"field": "meta.invoiceno", "operator": "==", "value": "536365"}
result = pipeline.run(
    data={"retriever": {"query": query, "filters": invoice_filter}}
)

In [None]:
# Show the raw pipeline output currently held in `result`.
print(result)

In [None]:
# Fetch every stored document for invoice 536365 straight from the store.
# Uses the explicit field/operator/value filter syntax, matching the filter
# passed to the retriever elsewhere in this notebook, instead of the legacy
# {"meta.invoiceno": [...]} shorthand.
document_store.filter_documents(
    filters={"field": "meta.invoiceno", "operator": "==", "value": "536365"}
)

In [None]:
# All rows of invoice 536365. The column names were lower-cased right after
# loading, so the column is 'invoiceno' ('InvoiceNo' raises a KeyError).
df[df['invoiceno'] == '536365']