In [1]:
!pip install rank-bm25 llama-index llama_hub transformers accelerate sentence_transformers



In [2]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from llama_index.llms import OpenAI
from tqdm.asyncio import tqdm
from llama_index import VectorStoreIndex, ServiceContext, PromptTemplate, Document
from llama_index.retrievers import BM25Retriever
from llama_index.response.notebook_utils import display_source_node
import nest_asyncio


nest_asyncio.apply()

In [3]:
df = pd.read_excel('dataset_grocery.xlsx')
id_col = []
for i in range(len(df)):
  id_col.append(np.random.randint(1e7, 1e8))

df['ID'] = id_col
df['lower_description'] = df['Description'].str.lower()
df.head()

Unnamed: 0,Description,Subcategory,Category,ID,lower_description
0,"Marketside Fresh Vegetable Medley Blend, 12 oz",Vegetable,Produce,75027076,"marketside fresh vegetable medley blend, 12 oz"
1,"Fresh Green Bell Pepper, Each",Vegetable,Produce,15186230,"fresh green bell pepper, each"
2,"Fresh Cilantro, Bunch",Vegetable,Produce,26810050,"fresh cilantro, bunch"
3,"Fresh Roma Tomato, Each",Vegetable,Produce,80385241,"fresh roma tomato, each"
4,"Fresh Jalapeno Pepper, Approx. 3-5 per 0.25 Pound",Vegetable,Produce,75384478,"fresh jalapeno pepper, approx. 3-5 per 0.25 pound"


In [4]:
#create vector db

docs = []
for i, row in df.iterrows():
    docs.append(Document(text = row['lower_description'],
                         doc_id = row['ID'],
                         extra_info = {'category' : row['Category'],
                                     'subcategory':row['Subcategory']}))
print(len(docs))

106


In [5]:
docs[0]

Document(id_='75027076', embedding=None, metadata={'category': 'Produce', 'subcategory': 'Vegetable'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='b41473d67b6462a55f50eb3bb69aa471ea9a87918c8884a0c38adee9fd62bea6', text='marketside fresh vegetable medley blend, 12 oz', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [6]:
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index import ServiceContext

embed_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en")

In [7]:
service_context = ServiceContext.from_defaults(chunk_size = 512, embed_model = embed_model, llm = None)

LLM is explicitly disabled. Using MockLLM.


In [8]:
index  = VectorStoreIndex.from_documents(docs, service_context=service_context)

In [9]:
async def run_queries(queries, retrievers):
  tasks = []
  for query in queries:
    for i, retriever in enumerate(retrievers):
      tasks.append(retriever.aretrieve(query))

  task_results = await tqdm.gather(*tasks)
  results_dict = {}

  for i, (query, query_result) in enumerate(zip(queries, task_results)):
    results_dict[(query, i)] = query_result

  return results_dict

In [10]:
vector_retriever = index.as_retriever(similarity_top_k = 3)
bm25_retriever = BM25Retriever.from_defaults(docstore = index.docstore, similarity_top_k = 3)

In [11]:
def fuse_results(results_dict, similarity_top_k: int = 3):
    """Fuse results."""
    k = 60.0  # `k` is a parameter used to control the impact of outlier rankings.
    fused_scores = {}
    text_to_node = {}

    # compute reciprocal rank scores
    for nodes_with_scores in results_dict.values():
        for rank, node_with_score in enumerate(
            sorted(
                nodes_with_scores, key=lambda x: x.score or 0.0, reverse=True
            )
        ):
            text = node_with_score.node.get_content()
            text_to_node[text] = node_with_score
            if text not in fused_scores:
                fused_scores[text] = 0.0
            fused_scores[text] += 1.0 / (rank + k)

    # sort results
    reranked_results = dict(
        sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    )

    # adjust node scores
    reranked_nodes: List[NodeWithScore] = []
    for text, score in reranked_results.items():
        reranked_nodes.append(text_to_node[text])
        reranked_nodes[-1].score = score

    return reranked_nodes[:similarity_top_k]

In [26]:
# queries = ['Great Value Large White Eggs, 60 Count']
# queries = ['Boneless Pork Loin Country Style Ribs, 2.3 - 3.8 lb Tray']
# queries = ['Tree Hut Coconut Lime Shea Sugar Exfoliating and Hydrating Body Scrub, 18 oz.']
# queries = ['Wet Ones Antibacterial Fresh Scent Hand Wipes 40 Ct Canister, Hypoallergenic, Kills Germs, Leaves Hands Feeling Clean']
# queries = ['Nestle Nesquik Chocolate Lowfat Milk, Ready to Drink, 14 fl oz']
# queries = ['Tuscan Dairy Farms Whole Milk, Vitamin D Milk Quart - 1 Milk Bottle']
# queries = ['Fresh Green Kale, 1 lb Bag'] #does work best
# queries = ['Organic Marketside Fresh Kale Blend, Rainbow, Green, Red and Lacinato Kale, 1 lb Bag']
# queries = ['Great Value Organic Ground Cardamom, 1.8 oz']
queries = ['Tostitos Scoops! Tortilla Chips Party Size, 14.5 oz Bag']
# queries = ['spaghetti']

In [27]:
results_dict = await run_queries(queries, [vector_retriever, bm25_retriever])

100%|██████████| 2/2 [00:00<00:00,  9.34it/s]


In [28]:
final_results = fuse_results(results_dict)
print(queries[0])
for n in final_results:
    display_source_node(n, source_length=500)

Tostitos Scoops! Tortilla Chips Party Size, 14.5 oz Bag


**Node ID:** 0d3cc520-7e11-4b30-818f-5ca354f87cb7<br>**Similarity:** 0.016666666666666666<br>**Text:** sabra snackers fresh guacamole dip and rolled tortilla chips, 2.8 oz, 1 pack<br>

**Node ID:** a12767f6-9e64-42fc-a301-d53fc52c0bb3<br>**Similarity:** 0.01639344262295082<br>**Text:** transocean seafood snackers, snack size leg style imitation crab, 1 - 3 oz small plastic bag<br>

**Node ID:** 6cd5a25d-b7c9-4764-9fca-4a78ef4f6432<br>**Similarity:** 0.016129032258064516<br>**Text:** wholly guacamole chunky bowl 15 oz<br>

In [32]:
# queries = ['Great Value Large White Eggs, 60 Count']
# queries = ['Boneless Pork Loin Country Style Ribs, 2.3 - 3.8 lb Tray']
# queries = ['Tree Hut Coconut Lime Shea Sugar Exfoliating and Hydrating Body Scrub, 18 oz.']
# queries = ['Wet Ones Antibacterial Fresh Scent Hand Wipes 40 Ct Canister, Hypoallergenic, Kills Germs, Leaves Hands Feeling Clean']
queries = ['Nestle Nesquik Chocolate Lowfat Milk, Ready to Drink, 14 fl oz']
# queries = ['Tuscan Dairy Farms Whole Milk, Vitamin D Milk Quart - 1 Milk Bottle']
# queries = ['Fresh Green Kale, 1 lb Bag'] #does work best
# queries = ['Organic Marketside Fresh Kale Blend, Rainbow, Green, Red and Lacinato Kale, 1 lb Bag']
# queries = ['Great Value Organic Ground Cardamom, 1.8 oz']
# queries = ['Tostitos Scoops! Tortilla Chips Party Size, 14.5 oz Bag']
# queries = ['spaghetti']
results_dict = await run_queries(queries, [vector_retriever, bm25_retriever])
final_results = fuse_results(results_dict)
print(queries[0])
for n in final_results:
    display_source_node(n, source_length=500)


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:00<00:00,  2.22it/s][A
100%|██████████| 2/2 [00:00<00:00,  2.65it/s]

Nestle Nesquik Chocolate Lowfat Milk, Ready to Drink, 14 fl oz





**Node ID:** 7df4d94f-0467-41d4-aa62-3bc0bd84af9f<br>**Similarity:** 0.016666666666666666<br>**Text:** great value 1% low fat milk, gallon, 128 fl oz<br>

**Node ID:** f9bcf19c-086c-4db0-9d25-809da1949d3c<br>**Similarity:** 0.01639344262295082<br>**Text:** lactaid 2% reduced fat milk, 96 oz<br>

**Node ID:** 8f08705b-54f2-49b3-ae36-a5deb3fef0e3<br>**Similarity:** 0.016129032258064516<br>**Text:** lactaid whole milk, 96 oz<br>

In [33]:
# queries = ['Great Value Large White Eggs, 60 Count']
# queries = ['Boneless Pork Loin Country Style Ribs, 2.3 - 3.8 lb Tray']
queries = ['Tree Hut Coconut Lime Shea Sugar Exfoliating and Hydrating Body Scrub, 18 oz.']
# queries = ['Wet Ones Antibacterial Fresh Scent Hand Wipes 40 Ct Canister, Hypoallergenic, Kills Germs, Leaves Hands Feeling Clean']
# queries = ['Nestle Nesquik Chocolate Lowfat Milk, Ready to Drink, 14 fl oz']
# queries = ['Tuscan Dairy Farms Whole Milk, Vitamin D Milk Quart - 1 Milk Bottle']
# queries = ['Fresh Green Kale, 1 lb Bag'] #does work best
# queries = ['Organic Marketside Fresh Kale Blend, Rainbow, Green, Red and Lacinato Kale, 1 lb Bag']
# queries = ['Great Value Organic Ground Cardamom, 1.8 oz']
# queries = ['Tostitos Scoops! Tortilla Chips Party Size, 14.5 oz Bag']
# queries = ['spaghetti']
results_dict = await run_queries(queries, [vector_retriever, bm25_retriever])
final_results = fuse_results(results_dict)
print(queries[0])
for n in final_results:
    display_source_node(n, source_length=500)


  0%|          | 0/2 [00:00<?, ?it/s][A
100%|██████████| 2/2 [00:00<00:00,  6.19it/s]

Tree Hut Coconut Lime Shea Sugar Exfoliating and Hydrating Body Scrub, 18 oz.





**Node ID:** 27b10617-5cfb-456c-8286-bdf70dd973e6<br>**Similarity:** 0.016666666666666666<br>**Text:** dove white beauty bar 4 oz, 8 ct<br>

**Node ID:** bc357483-e448-4f53-b7e3-0e3e1120ee0f<br>**Similarity:** 0.01639344262295082<br>**Text:** irish spring bar soap for men, aloe mist deodorant bar soap, 3.7 oz, 12 pack<br>

**Node ID:** 3097441a-38b1-47d5-a156-f79ce7d99092<br>**Similarity:** 0.016129032258064516<br>**Text:** coast refreshing deodorant bar soap, classic scent, 3.2 oz, 8 bars<br>

In [34]:
# queries = ['Great Value Large White Eggs, 60 Count']
queries = ['Boneless Pork Loin Country Style Ribs, 2.3 - 3.8 lb Tray']
# queries = ['Tree Hut Coconut Lime Shea Sugar Exfoliating and Hydrating Body Scrub, 18 oz.']
# queries = ['Wet Ones Antibacterial Fresh Scent Hand Wipes 40 Ct Canister, Hypoallergenic, Kills Germs, Leaves Hands Feeling Clean']
# queries = ['Nestle Nesquik Chocolate Lowfat Milk, Ready to Drink, 14 fl oz']
# queries = ['Tuscan Dairy Farms Whole Milk, Vitamin D Milk Quart - 1 Milk Bottle']
# queries = ['Fresh Green Kale, 1 lb Bag'] #does work best
# queries = ['Organic Marketside Fresh Kale Blend, Rainbow, Green, Red and Lacinato Kale, 1 lb Bag']
# queries = ['Great Value Organic Ground Cardamom, 1.8 oz']
# queries = ['Tostitos Scoops! Tortilla Chips Party Size, 14.5 oz Bag']
# queries = ['spaghetti']
results_dict = await run_queries(queries, [vector_retriever, bm25_retriever])
final_results = fuse_results(results_dict)
print(queries[0])
for n in final_results:
    display_source_node(n, source_length=500)


  0%|          | 0/2 [00:00<?, ?it/s][A
100%|██████████| 2/2 [00:00<00:00,  7.36it/s]

Boneless Pork Loin Country Style Ribs, 2.3 - 3.8 lb Tray





**Node ID:** ddd1f171-a2ea-4f96-aa2c-f1d309fcad47<br>**Similarity:** 0.016666666666666666<br>**Text:** pork country style ribs boneless, 1.1 - 2.5 lb tray<br>

**Node ID:** 3358aed3-701c-4635-8f43-ead51368ad7a<br>**Similarity:** 0.01639344262295082<br>**Text:** pork ribeye chops boneless, 2.0 - 3.3 lb tray<br>

**Node ID:** 8b430b85-cafa-4f9e-926a-777c45dc55a8<br>**Similarity:** 0.016129032258064516<br>**Text:** pork stew meat boneless, 0.9 - 2.0 lb tray<br>

In [31]:
pd.set_option('display.max_rows', 150)
df

Unnamed: 0,Description,Subcategory,Category,ID,lower_description
0,"Marketside Fresh Vegetable Medley Blend, 12 oz",Vegetable,Produce,75027076,"marketside fresh vegetable medley blend, 12 oz"
1,"Fresh Green Bell Pepper, Each",Vegetable,Produce,15186230,"fresh green bell pepper, each"
2,"Fresh Cilantro, Bunch",Vegetable,Produce,26810050,"fresh cilantro, bunch"
3,"Fresh Roma Tomato, Each",Vegetable,Produce,80385241,"fresh roma tomato, each"
4,"Fresh Jalapeno Pepper, Approx. 3-5 per 0.25 Pound",Vegetable,Produce,75384478,"fresh jalapeno pepper, approx. 3-5 per 0.25 pound"
5,"Fresh Cucumber, Each",Vegetable,Produce,56212846,"fresh cucumber, each"
6,"Marketside Fresh Spinach, 10 oz Bag, Fresh",Vegetable,Produce,22176614,"marketside fresh spinach, 10 oz bag, fresh"
7,"Great Value Chopped Spinach, 12 oz (Frozen)",Frozen Vegetable,Frozen,43399012,"great value chopped spinach, 12 oz (frozen)"
8,"Great Value Steamable Mixed Vegetables, Frozen...",Frozen Vegetable,Frozen,23298252,"great value steamable mixed vegetables, frozen..."
9,"Great Value Broccoli & Cauliflower, 12 oz (Fro...",Frozen Vegetable,Frozen,11322312,"great value broccoli & cauliflower, 12 oz (fro..."
