In [31]:
!pip install llama-index cohere datasets pandas
!pip install -U qdrant-client



Optional: install Rich to make error messages and stack traces easier to read.


In [32]:
! pip install 'rich[jupyter]'
%load_ext rich

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets<9,>=7.5.1->rich[jupyter])
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.1


Import your packages

In [33]:
import datetime
import os
import random
from pathlib import Path
from typing import Any

import pandas as pd
from datasets import load_dataset
from IPython.display import Markdown, display_markdown
from llama_index import (GPTVectorStoreIndex, ServiceContext,
                         SimpleDirectoryReader, StorageContext)
from llama_index.indices.postprocessor import FixedRecencyPostprocessor
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

def _list_dir(directory):
    """Return the entries of `directory` as a list of paths."""
    return list(directory.iterdir())


# Convenience helper attached to every Path object: `some_path.ls()`.
Path.ls = _list_dir

# Fixed seed for the `random` module.
random.seed(42)

In [34]:
# Placeholders only: replace each value with your own key before running,
# and never commit a notebook that contains real keys.
# NOTE: these assignments overwrite any value already present in the environment.
os.environ['OPENAI_API_KEY'] = "<YOUR_OPENAI_API_KEY>"
os.environ['COHERE_API_KEY'] = "<YOUR_COHERE_API_KEY>"
os.environ['QDRANT_API_KEY'] = "<YOUR_QDRANT_API_KEY>"


def check_environment_keys():
    """
    Verify that the API keys this notebook needs are available in the environment.

    A key counts as missing when the variable is unset, blank, or still holds an
    angle-bracket template value such as ``<YOUR_OPENAI_API_KEY>``.

    Raises:
        ValueError: if OPENAI_API_KEY or COHERE_API_KEY is missing.

    QDRANT_API_KEY is optional; when it is missing only a hint is printed.
    """

    def _is_missing(name):
        # Unset, whitespace-only and "<...>" template values all mean "no usable key".
        value = os.environ.get(name)
        if value is None:
            return True
        value = value.strip()
        return not value or (value.startswith("<") and value.endswith(">"))

    if _is_missing('OPENAI_API_KEY'):
        raise ValueError(
            "OPENAI_API_KEY cannot be None. Set the key using os.environ['OPENAI_API_KEY']='sk-xxx'"
        )
    if _is_missing('COHERE_API_KEY'):
        raise ValueError(
            "COHERE_API_KEY cannot be None. Set the key using os.environ['COHERE_API_KEY']='xxx'"
        )
    if _is_missing("QDRANT_API_KEY"):
        print("[Optional] If you want to use the Qdrant Cloud, please get the Qdrant Cloud API Keys and URL")


# Validate the keys now so a missing one raises here rather than in a later cell.
check_environment_keys()

In [5]:
# Colab-only: mount Google Drive at /content/drive so the CSV stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
def get_single_text(k):
    """Format one product record as a short text snippet.

    Args:
        k: a mapping (dict or DataFrame row) providing the keys
            'product', 'category' and 'sale_price'.

    Returns:
        A string holding the product name, its category and its sale price
        on separate lines, preceded by a fixed header line.
    """
    # 'product' must be a quoted key; the bare name `product` is not defined in this function.
    return f"Under the category:\n{k['product']}:\n{k['category']}\n{k['sale_price']}"


# Load the product catalogue from the mounted Google Drive and preview the first rows.
# NOTE(review): absolute Colab path — adjust it if the CSV lives elsewhere.
df=pd.read_csv("/content/drive/MyDrive/bBProducts.csv")
df.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [36]:
# (rows, columns) of the loaded catalogue.
df.shape

[1m([0m[1;36m27555[0m, [1;36m10[0m[1m)[0m

In [37]:
# Every column label except the first one.
df.columns[1:]


[1;35mIndex[0m[1m([0m[1m[[0m[32m'product'[0m, [32m'category'[0m, [32m'sub_category'[0m, [32m'brand'[0m, [32m'sale_price'[0m,
       [32m'market_price'[0m, [32m'type'[0m, [32m'rating'[0m, [32m'description'[0m[1m][0m,
      [33mdtype[0m=[32m'object'[0m[1m)[0m

In [38]:
# The second column label; it holds the product name used as the key below.
df.columns[1]

[32m'product'[0m

Next, collect each product's fields into a dictionary and write them to text files in a directory. Each product will be written to a text file named after the product.

In [39]:
# Build {product name: {column: value}} from df, the DataFrame loaded above.
# Rows sharing a product name overwrite one another, so only the last such row is kept.

# Every column except 'product'; computed once because it is the same for every row.
other_columns = df.columns.difference(['product'])

result_dict = {}
for _, row in df.iterrows():
    result_dict[row['product']] = {column: row[column] for column in other_columns}

# Show only the number of distinct products; echoing the whole dictionary floods the output.
len(result_dict)


In [40]:
# Peek at the fields stored for the first product without copying every key into a list.
result_dict[next(iter(result_dict))]

In [None]:
%%time
import os
write_dir = "sample"
if not os.path.exists(write_dir):
    os.mkdir(write_dir)
for key in result_dict.keys():
    data = result_dict[key]
    file_path = f"{write_dir}/"+f"{key}.txt".replace("/","_")
    print(file_path)
    with open(str(file_path), "w") as f:
        for k,v in data.items():
            f.write(f"{k}:{v}\n")

In [None]:
# Qdrant client created with the ":memory:" location.
client = QdrantClient(":memory:")

In [None]:
# Optional: use a Qdrant Cloud cluster instead of the in-memory client.
# The cloud client is created only when QDRANT_URL is set, so running every cell in order
# does not replace a working client with one that points at a placeholder URL.
qdrant_url = os.environ.get("QDRANT_URL")
if qdrant_url:
    client = QdrantClient(
        url=qdrant_url,
        api_key=os.environ.get("QDRANT_API_KEY"),
    )

## Load Data into LlamaIndex
LlamaIndex has a simple way to load documents from a directory. We can define a function to get the metadata from a file name, and pass this function to the `SimpleDirectoryReader` class.

In [None]:
# Turn the output directory into an absolute Path; re-running this cell gives the same result.
write_dir = Path(write_dir).resolve()
# Preview a few entries only; the directory holds one file per product.
write_dir.ls()[:5]

In [None]:
def get_file_metadata(file_name: str):
    """Derive document metadata from a product file's name.

    Args:
        file_name: path of a text file named "<product>.txt".

    Returns:
        A dict with a single "product" key holding the file name without its
        directory and without the final ".txt" extension.
    """
    # Files are named after the product, so the whole stem is the product name.
    # Slicing "_"-separated parts (as for date-named files) would drop most or all of it.
    return {"product": Path(file_name).stem}


# Read every file in write_dir, attaching the metadata returned by get_file_metadata.
reader = SimpleDirectoryReader(
    input_files=write_dir.ls(),
    file_metadata=get_file_metadata,
)
documents = reader.load_data()

In [47]:
# Number of documents loaded from the directory.
len(documents)

[1;36m23541[0m

In [48]:
# Keep only the first 10 documents as a small sample.
# NOTE: this rebinds `documents`; re-run the loading cell to get the full list back.
documents = documents[:10]

Let's confirm how many documents remain in our sample:

In [49]:
# Number of documents left after slicing.
len(documents)

[1;36m10[0m

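A `date` key in each document's metadata is *necessary* for the Recency Postprocessor that we are going to use later. Note that the metadata function above attaches only a `product` key, so a `date` value must be added to the documents before the recency-based query engines can be used.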

We have to parse these documents into nodes and create our QdrantVectorStore:

In [50]:
# Service context (wrapper container around the current classes), built with library defaults.
service_context = ServiceContext.from_defaults()
# Vector store backed by the Qdrant `client` created above, using the "BigBasket_Sample" collection.
vector_store = QdrantVectorStore(client=client, collection_name="BigBasket_Sample")

Next, we will create our `GPTVectorStoreIndex` from the documents. This operation might take some time as it's creating the index from the documents.

In [51]:
%%time
index = GPTVectorStoreIndex.from_documents(documents, vector_store=vector_store, service_context=service_context)

CPU times: user 55.8 ms, sys: 183 µs, total: 56 ms
Wall time: 3.97 s


In [52]:
# Baseline query engine with similarity_top_k=10.
query_engine = index.as_query_engine(similarity_top_k=10)

In [None]:
# Ask a question about the product descriptions and print the answer.
response = query_engine.query("Which product is microwave safe?.")
print(response)

In [None]:
# Ask a question that depends on the numeric `rating` field and print the answer.
response = query_engine.query("Which products have the ratings greater than 3.0 ?")
print(response)

In [None]:
# Recency post-processor configured with top_k=1.
# NOTE(review): the notebook text says this post-processor needs a `date` metadata key, but
# get_file_metadata attaches only `product` — confirm the documents carry a date before use.
recency_postprocessor = FixedRecencyPostprocessor(service_context=service_context, top_k=1)

In [None]:
top_k = 10  # set once and reused by every query engine below, for consistency

In [None]:
# Plain query engine with no post-processors; serves as the baseline for comparison.
index_query_engine = index.as_query_engine(
    similarity_top_k=top_k,
)

In [None]:
# Same retrieval settings as the baseline, with the recency post-processor attached.
recency_query_engine = index.as_query_engine(
    similarity_top_k=top_k,
    node_postprocessors=[recency_postprocessor],
)

In [None]:
# Cohere re-ranker authenticated with COHERE_API_KEY from the environment.
# top_n is set to top_k — presumably it then reorders the retrieved nodes without dropping any; confirm.
cohere_rerank = CohereRerank(api_key=os.environ["COHERE_API_KEY"], top_n=top_k)
# Query engine that passes the retrieved nodes through the re-ranker.
reranking_query_engine = index.as_query_engine(
    similarity_top_k=top_k,
    node_postprocessors=[cohere_rerank],
)

In [None]:
# Rebinds `query_engine` (defined earlier without post-processors) to an engine configured with
# both post-processors, listed as re-ranker first and recency second.
query_engine = index.as_query_engine(
    similarity_top_k=top_k,
    node_postprocessors=[cohere_rerank, recency_postprocessor],
)