In [1]:
import sys, os, time

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Pinecone as langchain_pinecone
from langchain_openai import OpenAIEmbeddings

# Load the variables defined in ../.env into the process environment;
# PINECONE_API_KEY is read back with os.getenv further down.
# The call is the cell's last expression, so its return value is displayed.
load_dotenv("../.env")

True

In [2]:
# import nltk
# nltk.download('punkt')

In [4]:
# Alternative corpus (disabled): FiQA documents and queries from BeIR
# docs = pd.DataFrame(load_dataset("BeIR/fiqa", "corpus", split="corpus"))
# queries = pd.DataFrame(load_dataset("BeIR/fiqa", "queries", split="queries"))

# Active corpus: BigBasket product list, read from a local CSV copy of
# https://www.kaggle.com/datasets/surajjha101/bigbasket-entire-product-list-28k-datapoints
df = pd.read_csv(filepath_or_buffer="../data/bigbasket.csv")
# Preview the first two rows
df.head(n=2)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."


In [5]:
# Show the column labels of the loaded product table.
df.columns

Index(['index', 'product', 'category', 'sub_category', 'brand', 'sale_price',
       'market_price', 'type', 'rating', 'description'],
      dtype='object')

In [6]:
# df[df.category == "Beauty & Hygiene"].brand.value_counts().head(50)

In [7]:
# Keep only the rows whose brand is exactly "Garnier", then preview the result.
# Note: `df` is rebound to the filtered frame from here on.
df = df.loc[df["brand"].eq("Garnier")]
df.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
102,103,Fructis Serum - Long & Strong,Beauty & Hygiene,Hair Care,Garnier,230.0,230.0,Shampoo & Conditioner,4.8,Garnier Fruits Long & Strong Strengthening Ser...
162,163,Color Naturals Creme Riche Ultra Hair Color - ...,Beauty & Hygiene,Hair Care,Garnier,199.0,199.0,Hair Color,4.1,Garnier Color Naturals is creme hair colour wh...
212,213,Black Naturals Hair Colour Shade 1-Deep Black ...,Beauty & Hygiene,Hair Care,Garnier,108.78,117.0,Hair Color,4.2,It is an oil-enriched cream colour which gives...
447,448,Black Naturals,Beauty & Hygiene,Hair Care,Garnier,37.0,39.0,Hair Color,4.2,Garnier Black Naturals is enriched with the go...
1329,1330,Skin Natural Serum sheetmask (Light complete +...,Beauty & Hygiene,Skin Care,Garnier,211.61,297.0,Face Care,,Garnier introduces a new generation of face ma...


In [8]:
# df.to_csv("../data/bigbasket_garnier.csv", index=False)

In [20]:
# Split every product description into small overlapping chunks and tag each
# chunk with the name of the product it came from.
# Library defaults, for reference: chunk_size: int = 4000, chunk_overlap: int = 200
# https://api.python.langchain.com/en/latest/_modules/langchain/text_splitter.html#RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

# A missing description is stored as NaN (a float), which is not text the
# splitter can handle, so such rows are dropped first. Texts and metadata are
# both taken from the same filtered frame so they stay aligned one-to-one.
described = df.dropna(subset=["description"])
chunks = text_splitter.create_documents(
    described["description"].tolist(),
    metadatas=[{"product_name": name} for name in described["product"]],
)
len(chunks)

254

In [21]:
def add_product_name_to_page_content(chunk):
    """Prefix ``chunk.page_content`` with a "Product name: ..." header line, in place.

    The product name is read from ``chunk.metadata['product_name']``. The call
    is idempotent: if the content already starts with this exact header it is
    left untouched, so re-running the notebook cell does not stack the header
    several times.

    Args:
        chunk: Object exposing a ``page_content`` string and a ``metadata``
            mapping that contains the key ``'product_name'``.

    Returns:
        None. ``chunk`` is modified in place.

    Raises:
        KeyError: If ``'product_name'`` is missing from ``chunk.metadata``.
    """
    header = f"Product name: {chunk.metadata['product_name']}\n"
    # Guard against double-prefixing when the function is applied more than once.
    if not chunk.page_content.startswith(header):
        chunk.page_content = f"{header}{chunk.page_content}"

# Prepend the product name to the text of every chunk (the helper modifies each
# chunk in place), then display the first chunk's text to check the result.
for chunk in chunks:
    add_product_name_to_page_content(chunk)
    
chunks[0].page_content

'Product name: Fructis Serum - Long & Strong\nGarnier Fruits Long & Strong Strengthening Serum detangles unruly hair, softens hair without heaviness and helps stop split and breakage ends. It is enriched with the goodness of Grape seed and'

In [22]:
# Name of the Pinecone index this notebook creates, fills and queries.
# NOTE(review): the "200-50" suffix presumably mirrors chunk_size / chunk_overlap - confirm.
index_name = "webinar-200-50"

# Pinecone client, authenticated with the key from the PINECONE_API_KEY environment variable
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
# Embedding model used to vectorise both the chunks and the queries
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [23]:
# Create the index only if no index with this name exists yet, so the cell can
# be re-run without failing on an "already exists" error.
if index_name not in [index_info['name'] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        metric="cosine",
        dimension=1536,  # must match the embedding size: the OpenAI embedding model uses 1536 dimensions
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )
    # Index provisioning is asynchronous: wait until the index reports ready
    # before any later cell tries to write to it.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

In [24]:
# Embed every chunk with `embeddings` and store the vectors in the index named
# `index_name`; the returned vector store is used for querying below.
# NOTE(review): re-running this cell presumably uploads the same chunks a second time - confirm.
docsearch = langchain_pinecone.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name,
)



In [25]:
# Open a handle to the index through the Pinecone client ...
index = pc.Index(name=index_name)
# ... and display its statistics (dimension, fullness, vector counts per namespace and in total)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 254}},
 'total_vector_count': 254}

In [19]:
# To reuse an index that is already populated, build the vector store from it
# instead of uploading again. Use the LangChain wrapper (imported here as
# `langchain_pinecone`), not the Pinecone client class:
# docsearch = langchain_pinecone.from_existing_index(index_name, embeddings)

# Retrieve the 10 chunks most similar to the question and display them.
query = "Does Garnier Fruits Long & Strong Strengthening Serum help in hair growth?"
docs = docsearch.similarity_search(query=query, k=10)
docs



[Document(page_content='Product name: Fructis Serum - Long & Strong\nGarnier Fruits Long & Strong Strengthening Serum detangles unruly hair, softens hair without heaviness and helps stop split and breakage ends. It is enriched with the goodness of Grape seed and avocado oil that result in smoother, shinier and longer hair.  For Beauty tips, tricks & more visit https://bigbasket.blog/', metadata={'product_name': 'Fructis Serum - Long & Strong'}),
 Document(page_content='Product name: Fructis - Long & Strong Strengthening Shampoo\nTransform weak and brittle hair into luscious locks with Garniers Long and Strong Fructus Fortifying Shampoo. This shampoo nourishes hair from the root, right down to the tip. The nutrients and minerals are absorbed within the hair and strengthened from within. Ensure your hair is taken care of the correct way with this shampoo that promises to nourish your hair and keep it detangled, without that unpleasant, heavy feeling. Make your hair shiny, luscious and st

In [15]:
# Cleanup: uncomment to delete the index used by this notebook
# pc.delete_index(index_name)