In [15]:
from haystack.dataclasses import Document
import pandas as pd

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore

from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import GPTGenerator

from haystack.components.retrievers.in_memory import InMemoryBM25Retriever


from dotenv import load_dotenv
import os


In [16]:
# Pull variables from the local .env file into the process environment,
# then read the OpenAI key from it (None when the variable is not set).
load_dotenv(dotenv_path=".env")
openai_key = os.environ.get("OPENAI_KEY")

In [17]:
# Source https://www.kaggle.com/datasets/carrie1/ecommerce-data?resource=download
# Load the raw export, discard rows that have no CustomerID, and lower-case
# every column name, all in one chain.
df = (
    pd.read_csv("data.csv", encoding='latin1')
    .dropna(subset=['CustomerID'])
    .rename(columns=str.lower)
)

# One dict per remaining row, keyed by the lower-cased column names
df_dict = df.to_dict(orient="records")

In [18]:
# Separator placed between the fields of a document's text: ";" followed by
# nine spaces. This reproduces the layout of the previously stored content
# exactly, so retrieval and prompts see the same text as before.
FIELD_SEPARATOR = ";" + " " * 9

# Columns copied verbatim into each Document's metadata (order preserved).
META_FIELDS = (
    "invoiceno",
    "stockcode",
    "description",
    "quantity",
    "invoicedate",
    "unitprice",
    "customerid",
    "country",
)

haystack_documents = []

# Create a list of Haystack documents, one per purchase row
for i, record in enumerate(df_dict):
    # Build the text as a plain string. The earlier version ended the
    # expression with a stray comma, which silently produced a 1-tuple that
    # then had to be unwrapped with [0].
    content_str = FIELD_SEPARATOR.join([
        f"Name of item purchased: {record['description']}",
        f"Quantity purchased: {record['quantity']}",
        f"Price of item: {record['unitprice']}",
        f"Date of purchase: {record['invoicedate']}",
        f"Country of purchase: {record['country']}",
        f"Customer ID: {record['customerid']}",
        f"Invoice Number: {record['invoiceno']}",
        f"Stock Code: {record['stockcode']}",
    ]) + ";"
    haystack_documents.append(Document(
        content=content_str,
        # Deterministic id derived from the row position: ZOOA1000000, ZOOA1000001, ...
        id=f"ZOOA{1000000 + i}",
        meta={field: record[field] for field in META_FIELDS},
    ))

In [19]:
# Sanity check: inspect the id assigned to the first generated document.
haystack_documents[0].id

'ZOOA1000000'

In [20]:
# In-memory document store configured with the "BM25Plus" ranking variant.
document_store = InMemoryDocumentStore(bm25_algorithm="BM25Plus")
# Index every purchase document. The cell displays the value returned by
# write_documents (NOTE(review): presumably the number of documents written — confirm).
document_store.write_documents(documents=haystack_documents)

406829

In [21]:
# Sanity check: with no filters given, look at the first document the store returns.
document_store.filter_documents()[0]

Document(id=ZOOA1000000, content: 'Name of item purchased: WHITE HANGING HEART T-LIGHT HOLDER;         Quantity purchased: 6;         P...', meta: {'invoiceno': '536365', 'stockcode': '85123A', 'description': 'WHITE HANGING HEART T-LIGHT HOLDER', 'quantity': 6, 'invoicedate': '12/1/2010 8:26', 'unitprice': 2.55, 'customerid': 17850.0, 'country': 'United Kingdom'})

In [22]:


######## Complete this section #############
# Prompt rendered by PromptBuilder for every question. The template reads two
# variables: `documents` (looped over to build the context; each document's
# content plus selected meta fields are printed) and `query` (the user question).
# The string is not raw, so each "\n" below becomes a real newline in the prompt.
prompt_template = """
You are an expert data analyst who helps customers and employees with their questions about purchases and products.
You use the information provided in the documents to answer the questions.
You can answer the following types of questions:
Questions regarding the order: please ask the user to give you the invoice number.
Questions regarding the product: please ask the user to give you the stock code.
Questions regarding purchases made on a given day: please ask the user to give you the date of purchase.
If you are asked to calculate the total price of a purchase, please ask the user to give you the invoice 
number and add the total price of the items in the purchase. Outline the item name, number of items and cost per unit in your response. 
If you are asked to calculate the total number of items for a purchase, please ask the user to give 
you the invoice number and add the total number of items in the purchase. Outline the item name and number of items in your response.
If the invoiceno starts with C and the total number of items is negative,it means the items were returned. You should indicate this clearly in your answer.
If the documents do not contain the answer to the question, say that ‘Answer is unknown.’
Context:
{% for doc in documents %}
    Purchase information: {{ doc.content }} 
    Invoice Number: {{ doc.meta['invoiceno'] }} 
    Stock Code: {{doc.meta['stockcode']}}
    Quantity purchased: {{doc.meta['quantity']}}
    Date of purchase: {{doc.meta['invoicedate']}}
    Price per item: {{doc.meta['unitprice']}} \n
{% endfor %};
Question: {{query}}
\n Answer:
"""
# Component that fills the template above with the retrieved documents and the query.
prompt_builder = PromptBuilder(prompt_template)
# BM25 keyword retriever over the in-memory store. No top_k is passed, so the
# retriever's default result count applies.
# NOTE(review): an order with more line items than that default would reach the
# prompt only partially — confirm the default is large enough for the totals questions.
retriever = InMemoryBM25Retriever(document_store=document_store)
############################################
# Generator backed by GPT-4; temperature 0 is requested through generation_kwargs.
llm = GPTGenerator(
    model='gpt-4',
    api_key=openai_key,
    generation_kwargs={"temperature": 0},
)

# Register the three components under the names used in the run() inputs.
prediction_pipeline = Pipeline()
prediction_pipeline.add_component(name="retriever", instance=retriever)
prediction_pipeline.add_component(name="prompt_builder", instance=prompt_builder)
prediction_pipeline.add_component(name="generator", instance=llm)

# Data flow: retrieved documents -> prompt builder -> generator.
prediction_pipeline.connect("retriever.documents", "prompt_builder.documents")
prediction_pipeline.connect("prompt_builder", "generator")


  instance = super().__call__(*args, **kwargs)


In [30]:

# Smoke test: ask for the line items of one known invoice.
query = "Items for order with invoice number 536365"

# The same question is handed to the retriever (to search) and to the
# prompt builder (to fill the template's query slot).
pipeline_inputs = {
    "retriever": {"query": query},
    "prompt_builder": {"query": query},
}
result = prediction_pipeline.run(data=pipeline_inputs)

# Keep only the first reply of the generator.
final_answer = result['generator']['replies'][0]

Ranking by BM25...: 100%|██████████| 406829/406829 [00:22<00:00, 18199.55 docs/s]


## Test whether the pipeline answers correctly and consistently

In [None]:
def get_items_from_answer(answer):
    """Extract (item name, quantity) pairs from a free-text answer.

    Every line containing "Item" is split on ":". The item name is the text
    after the first colon, up to the first ", "; the quantity is the text
    after the second colon. For example,
    "1. Item: WHITE METAL LANTERN, Quantity: 6" yields
    ("WHITE METAL LANTERN", "6").

    Lines that contain "Item" but fewer than two colons (such as a header
    "Items for order 536365:") are skipped rather than raising IndexError.

    Args:
        answer: The text reply to parse.

    Returns:
        A list of (item_name, quantity) tuples. Both elements are strings,
        in the order the lines appear in the answer.
    """
    items = []
    for line in answer.strip().split("\n"):
        if "Item" not in line:
            continue
        fields = line.split(":")
        if len(fields) < 3:
            # Not an "<label>: <name>, <label>: <quantity>" line; nothing to extract.
            continue
        item_name = fields[1].strip().split(", ")[0]
        quantity = fields[2].strip()
        items.append((item_name, quantity))
    return items

def test_answers_against_ground_truth(test_df, items):
    
    fail_flags = []
    for item, quantity in items:
        if not test_df[(test_df['description'] == item) & (test_df['quantity'] == int(quantity))].empty:
            pass
        else:
            fail_flags.append((item, quantity))

    if len(fail_flags) == 0:
        print("All items found!")
    else:
        print("The following items were not found:")
        print(fail_flags)
    
    return fail_flags

def item_quantity_test(order_number):
    """Ask the pipeline for the items of one order and parse its reply.

    Args:
        order_number: Invoice number interpolated into the question.

    Returns:
        Whatever get_items_from_answer extracts from the generator's first reply.
    """
    question = f"Items for order with invoice number {order_number}"

    # Both components receive the same question under their "query" input.
    pipeline_inputs = {
        component: {"query": question}
        for component in ("retriever", "prompt_builder")
    }
    pipeline_output = prediction_pipeline.run(data=pipeline_inputs)

    first_reply = pipeline_output['generator']['replies'][0]
    return get_items_from_answer(first_reply)

# Ask the same question repeatedly and record, per run, which parsed items
# did not match the ground truth (an empty list means the run was fully correct).
# Every run ranks the whole document store and calls the LLM, so this cell is slow.
N_RUNS = 100
ORDER_NUMBER = "536365"

# The ground-truth rows are identical for every run, so filter them once.
test_df = df[df['invoiceno'] == ORDER_NUMBER]

results = []
for i in range(N_RUNS):
    items = item_quantity_test(ORDER_NUMBER)
    test_result = test_answers_against_ground_truth(test_df, items)
    results.append(test_result)


In [None]:
# Ask for the total item count of an order whose invoice number starts with "C".
query = "Total number of items purchased for order C536825"
pipeline_inputs = {
    "retriever": {"query": query},
    "prompt_builder": {"query": query},
}
result = prediction_pipeline.run(data=pipeline_inputs)
print(result['generator']['replies'][0])

In [None]:
# Ask for the total cost of the same "C"-prefixed order.
query = "Total cost for order C536825"
pipeline_inputs = {
    "retriever": {"query": query},
    "prompt_builder": {"query": query},
}
result = prediction_pipeline.run(data=pipeline_inputs)
print(result['generator']['replies'][0])

In [61]:
# Distinct invoice numbers that contain the letter "C".
df.loc[df['invoiceno'].str.contains("C"), 'invoiceno'].unique()

array(['C536379', 'C536383', 'C536391', ..., 'C581499', 'C581568',
       'C581569'], dtype=object)

In [None]:
# Invoice numbers that have at least one negative-quantity row, narrowed to
# those containing "C".
negative_quantity_invoices = df.loc[df['quantity'] < 0, 'invoiceno'].unique()
[invoice_no for invoice_no in negative_quantity_invoices if "C" in invoice_no]

In [None]:
# Ground truth: total number of items on invoice 536365.
df.loc[df['invoiceno'] == '536365', 'quantity'].sum()

In [None]:
# Ground truth: total cost of invoice 536365 (quantity x unit price, summed over its rows).
order_lines = df[df['invoiceno'] == '536365']
(order_lines['quantity'] * order_lines['unitprice']).sum()

In [11]:
# Distinct purchase timestamps recorded for stock code 84406B.
df.loc[df['stockcode'] == '84406B', 'invoicedate'].unique()

array(['12/1/2010 8:26', '12/1/2010 9:02', '12/1/2010 9:32',
       '12/1/2010 10:51', '12/1/2010 11:33', '12/2/2010 9:41',
       '12/2/2010 9:44', '12/2/2010 10:54', '12/2/2010 10:56',
       '12/2/2010 11:41', '12/2/2010 12:22', '12/2/2010 12:23',
       '12/2/2010 12:25', '12/2/2010 14:04', '12/2/2010 14:06',
       '12/2/2010 15:24', '12/2/2010 15:26', '12/2/2010 17:41',
       '12/5/2010 13:05', '12/5/2010 13:08', '12/5/2010 13:18',
       '12/5/2010 13:49', '12/5/2010 13:55', '12/5/2010 16:24',
       '12/6/2010 11:26', '12/6/2010 14:36', '12/6/2010 15:27',
       '12/7/2010 13:29', '12/8/2010 10:35', '12/8/2010 13:03',
       '12/8/2010 16:15', '12/9/2010 13:34', '12/9/2010 16:53',
       '12/12/2010 13:26', '12/12/2010 16:14', '12/14/2010 11:48',
       '12/14/2010 14:40', '12/15/2010 9:58', '12/16/2010 18:20',
       '12/19/2010 13:31', '12/20/2010 11:56', '1/4/2011 12:58',
       '1/6/2011 15:25', '1/6/2011 16:03', '1/6/2011 18:57',
       '1/7/2011 13:17', '1/7/2011 16:32',

In [None]:

# Retrieval-only pipeline: a single BM25 retriever over the same document store,
# used to inspect what the retriever returns without calling the LLM.
pipeline = Pipeline()
pipeline.add_component(
    name="retriever",
    instance=InMemoryBM25Retriever(document_store=document_store),
)

In [None]:
# Retrieve with a metadata filter so only documents of invoice 536365 are considered.
query = "Names of items purchased"
invoice_filter = {"field": "meta.invoiceno", "operator": "==", "value": "536365"}
result = pipeline.run(data={"retriever": {"query": query, "filters": invoice_filter}})

In [None]:
# Show the raw output of the most recent pipeline run.
print(result)

In [None]:
# Fetch every stored document of invoice 536365 directly from the store.
# Uses the same explicit field/operator/value filter form as the retriever
# call in this notebook, instead of the older {"meta.field": [values]} shorthand.
document_store.filter_documents(
    filters={"field": "meta.invoiceno", "operator": "==", "value": "536365"}
)

In [None]:
# Ground-truth rows for invoice 536365. Column names were lower-cased when the
# data was loaded, so the column is 'invoiceno' ('InvoiceNo' raises KeyError).
df[df['invoiceno'] == '536365']