In [1]:
import sys, os, time

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd

from langchain_community.vectorstores import Pinecone as langchain_pinecone
from common import SpacySentenceTokenizer, get_indexing_configuration

# Load environment variables from ../.env (path is relative to the working directory).
# PINECONE_API_KEY is read from the environment further down via os.getenv.
load_dotenv("../.env")



True

In [2]:
# Load the BigBasket beauty-products CSV into a DataFrame.
# Later cells use the "description" column as the text to chunk and the
# "product" column as per-chunk metadata.
df = pd.read_csv("../data/bigbasket_beauty.csv")
# Preview the first two rows.
df.head(2)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,questions
0,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,['Is Nivea Creme Soft Soap effective for washi...
1,103,Fructis Serum - Long & Strong,Beauty & Hygiene,Hair Care,Garnier,230.0,230.0,Shampoo & Conditioner,4.8,Garnier Fruits Long & Strong Strengthening Ser...,['Does Fructis Serum Long & Strong contain avo...


In [3]:
# Optional sanity check for the sentence tokenizer (left disabled):
# tokenizer = SpacySentenceTokenizer()
# text = "This is first sentence. This is second sentence."
# docs = tokenizer.create_documents([text, text], metadatas=[{"id": 1}, {"id": 2}])
# print(docs)

In [4]:
# Pick one of the predefined indexing set-ups; the helper returns the text
# splitter, the embedding object, the embedding model name, the vector
# dimension and the name of the Pinecone index for that set-up.
indexing_config = 9
(
    text_splitter,
    embeddings,
    emb_model_name,
    dimension,
    index_name,
) = get_indexing_configuration(indexing_config)

# One metadata dict per description, carrying the product name of its row.
product_metadatas = [{"product_name": name} for name in df["product"].values]

# Split every product description into chunks.
chunks = text_splitter.create_documents(
    df.description.values,
    metadatas=product_metadatas,
)

# Cell output: chunk count, splitter settings and the target index name.
len(chunks), text_splitter._chunk_size, text_splitter._chunk_overlap, index_name

(2731, 200, 50, 'beauty-rc-cs200-co50-all-mpnet')

In [5]:
def add_product_name_to_page_content(chunk):
    """Prefix ``chunk.page_content`` with the chunk's product name, in place.

    The text becomes ``"Product name: <name>\\n<original text>"`` so the product
    name is part of what gets embedded and retrieved.

    The operation is idempotent: if the text already starts with this exact
    header it is left untouched, so re-running the notebook cell does not
    stack the header a second time.

    Args:
        chunk: object exposing a ``page_content`` string and a ``metadata``
            mapping that contains the key ``"product_name"``.

    Returns:
        None. ``chunk`` is modified in place.

    Raises:
        KeyError: if ``chunk.metadata`` has no ``"product_name"`` entry.
    """
    header = f"Product name: {chunk.metadata['product_name']}\n"
    # Guard against double application when the cell is executed again.
    if not chunk.page_content.startswith(header):
        chunk.page_content = header + chunk.page_content

# Apply the helper to every chunk; each chunk object is modified in place.
for chunk in chunks:
    add_product_name_to_page_content(chunk)
    
# Show the first chunk to verify the resulting text.
print(chunks[0].page_content)

Product name: Creme Soft Soap - For Hands & Body
Nivea Creme Soft Soap gives your skin the best care that it must get. The soft bar consists of Vitamins F and Almonds which are really skin gracious and help you get great skin. It provides the skin


In [7]:
# Instantiate a Pinecone client. The API key is taken from the environment
# (expected to be supplied there, e.g. through the .env file loaded earlier).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the index only when it does not exist yet, so the cell can be re-run.
existing_index_names = [index_info['name'] for index_info in pc.list_indexes()]
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        metric="cosine",
        # Vector size comes from the selected indexing configuration and must
        # match the output size of the embedding model used for the upload.
        dimension=dimension,
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )
    # Index creation is asynchronous: wait until the index reports ready
    # before any later cell tries to write to it.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

In [8]:
# Embed the chunks and upload them to the index.
# Every chunk gets a deterministic ID derived from its position, so running
# this cell again overwrites the same vectors instead of adding a second copy
# of each chunk (without explicit ids the vector store assigns new ones on
# every call). Vectors stored by earlier runs under other IDs are not removed.
chunk_ids = [f"{index_name}-{position}" for position in range(len(chunks))]
docsearch = langchain_pinecone.from_documents(
    chunks,
    embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

In [9]:
# Connect to the index through the Pinecone client.
index = pc.Index(index_name)
# Display index statistics: dimension, fullness, per-namespace and total vector counts.
# NOTE(review): the reported count may lag behind a very recent upload — confirm
# against the Pinecone documentation before treating a mismatch as data loss.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2000}},
 'total_vector_count': 2000}

In [10]:
# If the index is already populated, the vector store can be loaded directly
# instead of uploading again. Use the LangChain wrapper (imported here as
# `langchain_pinecone`); the bare name `Pinecone` in this notebook is the
# Pinecone client class.
# docsearch = langchain_pinecone.from_existing_index(index_name, embeddings)

# Example question; retrieve the 10 most similar chunks.
query = "Does Loreal Total Repair 5 Serum help in hair growth?"
docs = docsearch.similarity_search(query, k=10)
docs

[Document(page_content="Product name: Total Repair 5 Serum\n5 Problems. 1 Solution. L'Oreal Paris Total Repair 5 Repairing Serum helps fight against the five visible signs of damaged hair - hair fall, dryness, roughness, dullness and split ends without", metadata={'product_name': 'Total Repair 5 Serum'}),
 Document(page_content='Product name: Gliss Hair Repair Extreme Serum - Deep Repair\nof the hair. Tired and damaged hair loses these natural building blocks and Gliss Deep Repair Serum Series is designed to strengthen them and fill gaps in the hair structure.', metadata={'product_name': 'Gliss Hair Repair Extreme Serum - Deep Repair'}),
 Document(page_content='Product name: Gliss Hair Repair Extreme Serum - Deep Repair\nof the hair. Tired and damaged hair loses these natural building blocks and Gliss Deep Repair Serum Series is designed to strengthen them and fill gaps in the hair structure.', metadata={'product_name': 'Gliss Hair Repair Extreme Serum - Deep Repair'}),
 Document(page_

In [11]:
# Clean-up: uncomment to delete the index once it is no longer needed.
# pc.delete_index(index_name)