In [1]:
import sys, os, time

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd

from langchain_community.vectorstores import Pinecone as langchain_pinecone
from common import SpacySentenceTokenizer, get_indexing_configuration

# Load settings from the project-level .env file (e.g. the PINECONE_API_KEY read
# further down). As the cell's last expression, the call's boolean result is shown.
load_dotenv(dotenv_path="../.env")



True

In [2]:
# Read the L'Oreal product table; the cells shown below only use its
# `product` and `description` columns.
df = pd.read_csv(filepath_or_buffer="../data/bigbasket_loreal.csv")
# Preview the first two rows.
df.head(n=2)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,questions
0,935,6 Oil Nourish Conditioner,Beauty & Hygiene,Hair Care,Loreal Paris,159.0,199.0,Shampoo & Conditioner,4.3,Does your hair get frizzy and unmanageable? Ar...,['Is 6 Oil Nourish Conditioner for all hair ty...
1,984,Men Expert White Activ Intensive Whitening Foam,Beauty & Hygiene,Men's Grooming,Loreal Paris,499.0,499.0,Face & Body,3.0,LOreal Men Expert White Activ intensive whiten...,['Does LOreal Men Expert White Activ Intensive...


In [3]:
# Scratch example of the sentence splitter (kept for reference, not executed):
# tokenizer = SpacySentenceTokenizer()
# text = "This is first sentence. This is second sentence."
# docs = tokenizer.create_documents([text, text], metadatas=[{"id": 1}, {"id": 2}])
# print(docs)

In [4]:
# Preset number understood by `get_indexing_configuration`, which returns the
# splitter, embedding model, vector dimension and index name to use.
# NOTE(review): the meaning of preset 5 is defined in `common` — confirm there.
indexing_config = 5
text_splitter, embeddings, dimension, index_name = get_indexing_configuration(
    indexing_config
)

# Split each description into chunks; one metadata dict per description
# attaches the product name to the chunks.
chunks = text_splitter.create_documents(
    df["description"].values,
    metadatas=[{"product_name": name} for name in df["product"].values],
)

# Quick summary of the resulting configuration.
(
    len(chunks),
    text_splitter._chunk_size,
    text_splitter._chunk_overlap,
    index_name,
)

(760, None, None, 'loreal-sst-openai-large')

In [5]:
def add_product_name_to_page_content(chunk):
    """Prefix ``chunk.page_content`` with its product name, in place.

    The header has the form ``"Product name: <name>\\n"`` where ``<name>`` is
    ``chunk.metadata['product_name']``. The call is idempotent: a chunk whose
    content already starts with that exact header is left untouched, so
    re-running the notebook cell does not stack duplicate headers.

    Args:
        chunk: Object exposing a ``page_content`` string and a ``metadata``
            mapping that contains the key ``'product_name'``.

    Returns:
        None. ``chunk`` is modified in place.

    Raises:
        KeyError: If ``chunk.metadata`` has no ``'product_name'`` entry.
    """
    header = f"Product name: {chunk.metadata['product_name']}\n"
    # Guard against double-prefixing when the same chunk is processed again.
    if not chunk.page_content.startswith(header):
        chunk.page_content = header + chunk.page_content

# Prefix every chunk in place so the text that later gets embedded names its product.
for chunk in chunks:
    add_product_name_to_page_content(chunk)
    
# Show the first chunk to confirm the prefix was applied.
print(chunks[0].page_content)

Product name: 6 Oil Nourish Conditioner
Does your hair get frizzy and unmanageable?


In [6]:
# Instantiate the Pinecone client; the API key is read from the environment
# (populated by the `load_dotenv` call in the first cell).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the index only when it does not exist yet, so re-running this cell is safe.
if index_name not in [index_info['name'] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        metric="cosine",
        # Must equal the embedding model's vector size. The value comes from
        # `get_indexing_configuration`, so it follows the selected preset
        # instead of being hard-coded here.
        dimension=dimension,
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )
    # Index creation is asynchronous: wait until Pinecone reports the index
    # as ready so the upload in the next cell does not race with provisioning.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

In [7]:
# Upload the chunks only while the index is still empty. `from_documents`
# embeds and inserts every chunk on each call, so re-running this cell
# unguarded would pay for the embeddings again and store the chunks a second
# time (assumes LangChain's default of generating fresh ids — TODO confirm).
# To rebuild after changing the chunks, delete the index first (last cell).
if pc.Index(index_name).describe_index_stats().total_vector_count == 0:
    docsearch = langchain_pinecone.from_documents(chunks, embeddings, index_name=index_name)
else:
    # Index already populated: attach to it without re-embedding anything.
    docsearch = langchain_pinecone.from_existing_index(index_name, embeddings)



In [10]:
# Open a handle to the index by name ...
index = pc.Index(name=index_name)
# ... and display its statistics (dimension, per-namespace and total vector counts).
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 760}},
 'total_vector_count': 760}

In [9]:
# To query an index that is already populated without uploading again,
# build the store with:
# docsearch = langchain_pinecone.from_existing_index(index_name, embeddings)

query = "Does Loreal Total Repair 5 Serum help in hair growth?"
# Retrieve the ten chunks most similar to the question and display them.
docs = docsearch.similarity_search(query=query, k=10)
docs



[Document(page_content='Product name: Total Repair 5 Serum\nExplore our best hair products to revive and repair damaged hair.', metadata={'product_name': 'Total Repair 5 Serum'}),
 Document(page_content='Product name: Total Repair 5 Serum\nExplore our best hair products to revive and repair damaged hair.', metadata={'product_name': 'Total Repair 5 Serum'}),
 Document(page_content='Product name: Total Repair 5 Serum\nExplore our best hair products to revive and repair damaged hair.', metadata={'product_name': 'Total Repair 5 Serum'}),
 Document(page_content='Product name: Total Repair 5 Serum\nExplore our best hair products to revive and repair damaged hair.', metadata={'product_name': 'Total Repair 5 Serum'}),
 Document(page_content='Product name: Total Repair 5 Serum\nExplore our best hair products to revive and repair damaged hair.', metadata={'product_name': 'Total Repair 5 Serum'}),
 Document(page_content='Product name: Total Repair 5 Serum\nExplore our best hair products to revive

In [33]:
# Cleanup (destructive): uncomment to delete the index and every vector stored in it.
# pc.delete_index(index_name)