In [1]:
import sys, os, time

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd

from langchain_community.vectorstores import Pinecone as langchain_pinecone
from common import SpacySentenceTokenizer, get_indexing_configuration

# Pull settings from the repo-level .env file into the process environment;
# PINECONE_API_KEY is read from there via os.getenv further down.
load_dotenv(dotenv_path="../.env")



True

In [2]:
# Read the product catalogue CSV and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="../data/bigbasket_beauty.csv")
df.head(n=2)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,questions
0,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,['Is Nivea Creme Soft Soap effective for washi...
1,103,Fructis Serum - Long & Strong,Beauty & Hygiene,Hair Care,Garnier,230.0,230.0,Shampoo & Conditioner,4.8,Garnier Fruits Long & Strong Strengthening Ser...,['Does Fructis Serum Long & Strong contain avo...


In [3]:
# Example (disabled): splitting raw text into documents with the spaCy sentence tokenizer.
# tokenizer = SpacySentenceTokenizer()
# text = "This is first sentence. This is second sentence."
# docs = tokenizer.create_documents([text, text], metadatas=[{"id": 1}, {"id": 2}])
# print(docs)

In [4]:
# Select one of the predefined indexing setups; the helper hands back the
# splitter, embedding model, model name, vector dimension and index name.
indexing_config = 5
(
    text_splitter,
    embeddings,
    emb_model_name,
    dimension,
    index_name,
) = get_indexing_configuration(indexing_config)

# One metadata dict per row, so every chunk carries the name of its product.
product_metadatas = [{"product_name": name} for name in df["product"].values]
chunks = text_splitter.create_documents(df.description.values, metadatas=product_metadatas)

# Display chunk count, splitter settings and the target index name.
len(chunks), text_splitter._chunk_size, text_splitter._chunk_overlap, index_name

(1999, 200, 50, 'beauty-rc-cs200-co50-openai-small')

In [5]:
def add_product_name_to_page_content(chunk):
    """Prefix the chunk's text with its product name and clear its metadata.

    The chunk is modified in place and nothing is returned. A chunk whose
    metadata has no ``product_name`` entry is left untouched: this function
    empties the metadata after prefixing, so a missing key means the chunk was
    already processed (for example when the notebook cell is run twice).
    Without this guard a second run raises ``KeyError``.

    Args:
        chunk: object exposing ``page_content`` (str) and ``metadata`` (dict).
    """
    if "product_name" not in chunk.metadata:
        # Already processed (or never tagged): do not add a second header.
        return
    chunk.page_content = f"Product name: {chunk.metadata['product_name']}\n{chunk.page_content}"
    # Metadata is discarded once the name lives in the text itself
    # (presumably so no metadata is sent to the vector store -- confirm).
    chunk.metadata = {}

# Rewrite every chunk in place so its text starts with the product name.
for chunk in chunks:
    add_product_name_to_page_content(chunk)

# Spot-check the first chunk to confirm the header was added.
first_chunk = chunks[0]
print(first_chunk.page_content)

Product name: Creme Soft Soap - For Hands & Body
Nivea Creme Soft Soap gives your skin the best care that it must get. The soft bar consists of Vitamins F and Almonds which are really skin gracious and help you get great skin. It provides the skin


In [6]:
# Instantiate a Pinecone client; the API key comes from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Start from a clean slate: drop any stale index that has the same name.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create a fresh serverless index. `dimension` comes from the indexing
# configuration and has to match the size of the vectors the embedding model emits.
pc.create_index(
    name=index_name,
    metric="cosine",
    dimension=dimension,
    spec=ServerlessSpec(cloud="aws", region="us-west-2"),
)

# Wait until Pinecone reports the index as ready instead of sleeping for a
# fixed time; give up after two minutes so the notebook cannot hang forever.
ready_deadline = time.monotonic() + 120
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(f"Pinecone index {index_name!r} was not ready within 120 seconds")
    time.sleep(1)

In [7]:
# Embed every chunk and store the resulting vectors in the Pinecone index.
docsearch = langchain_pinecone.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name,
)





In [8]:
# connect to index
index = pc.Index(index_name)

# Freshly upserted vectors may take a moment to show up in the stats. Poll
# until every chunk is counted (or one minute has passed) instead of trusting
# a single fixed sleep to be long enough.
stats_deadline = time.monotonic() + 60
stats = index.describe_index_stats()
while stats['total_vector_count'] < len(chunks) and time.monotonic() < stats_deadline:
    time.sleep(2)
    stats = index.describe_index_stats()

# view index stats
stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1999}},
 'total_vector_count': 1999}

In [9]:
# Sanity check: the index must hold exactly one vector per chunk.
expected_vector_count = len(chunks)
assert stats['total_vector_count'] == expected_vector_count

In [10]:
# To reuse an index that already exists instead of re-indexing, load it like this:
# docsearch = langchain_pinecone.from_existing_index(index_name, embeddings)

# Fetch the ten chunks closest to a sample product question and display them.
query = "Is Taft Ultimate Wax a gel or a wax?"
docs = docsearch.similarity_search(query=query, k=10)
docs



[Document(page_content='Product name: Taft Ultimate Wax\nis structured like wax and holds like a gel. It helps to protect the hair from drying out. It has the highest taffeta hold and crystal shine effect.'),
 Document(page_content='Product name: Taft Ultimate Wax\nUltimate strongest taffeta-stop for 100% fixation with liquid crystal gloss effect that gives the hair a luxurious radiance. Ultimate Wax stop that is structured like wax and holds like a gel. It'),
 Document(page_content='Product name: Taft Shine Gel Wax\nTaft Hair Wax is a sculpting product in the Schwarzkopf line and it is used to add hold and shine to hair. The base formula includes vegetable or mineral oils that work to build a finish without being'),
 Document(page_content='Product name: Taft Shine Gel Wax\noils that work to build a finish without being sticky.'),
 Document(page_content='Product name: Taft Power Wax\nwax and holds like a gel, all-weather will work for wet or dry hair. Make your hairdo an elegant and st

In [None]:
# pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
# pc.delete_index(index_name)