In [None]:
# !pip install langchain-openai
# !pip install langchain-huggingface 
# !pip install faiss-cpu


# An updated HuggingFaceEmbeddings class exists in the langchain-huggingface package and should be used instead. To use it, run `pip install -U langchain-huggingface` and import it with `from langchain_huggingface import HuggingFaceEmbeddings`.

^C


In [1]:
import pandas as pd
import json
from pprint import pprint
import os
import json
# from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from IPython.display import display, Markdown
from dotenv import load_dotenv, find_dotenv
from langchain.document_loaders import TextLoader, CSVLoader, JSONLoader, PyPDFLoader
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate


# Load the csv file which has the data

In [3]:
# Build the path with os.path.join instead of a backslash literal: the original
# "Resources\A..." string depended on the invalid escape "\A" and on a
# Windows-only path separator.
filename = os.path.join("Resources", "AmazonHomeKitchenReviews.csv")
# Load dataset (modify path as needed)
df = pd.read_csv(filename)
print(len(df))
# Rename the generic column names to the descriptive ones that the document
# builder reads later (product_title, review_title, review_text).
df_renamed = df.rename(columns={'title_y' : 'product_title','title_x':'review_title','text':'review_text'})

754079



# Load the api key for HuggingfaceRead

In [4]:
#replace with your own env file containing API keys
# os.path.join keeps the path portable; the original 'Resources\keys.env'
# literal depended on the invalid escape "\k" and a Windows-only separator.
load_dotenv(find_dotenv(os.path.join('Resources', 'keys.env')))
# Read the Hugging Face token from the environment (None if the variable is unset).
huggingfacehubapi = os.getenv('HuggingfaceRead')
# Do not print the token: notebook outputs are saved with the file and would leak it.


# Function to load csv as documents 

In [5]:

def load_docs(doc):
    """Load the CSV file at ``doc`` with ``CSVLoader`` and return the resulting documents."""
    csv_loader = CSVLoader(doc)
    documents = csv_loader.load()
    return documents

In [6]:
# Load the raw CSV through CSVLoader and show the text of the first document
# to see which columns end up in page_content.
docs = load_docs(filename)  # Change the filename accordingly
print(docs[0].page_content) 

: 59
rating: 5
title_x: Adorable!
text: These are so sweet. I do wish the stopper part was a little longer in length but they work great.
images: []
asin: B01HBWGU80
parent_asin: B01DR2ACA0
user_id: AGKHLEW2SOWHNMFQIJGBECAF7INQ
timestamp: 2019-07-23 04:29:16.671
helpful_vote: 0
verified_purchase: True
title_y: Little Bird Wine Bottle Stopper, Silicone Stoppers, Reusable, Leak Proof, Cute, Fun, Decorative, Multipack (Assorted Color, Set of 6)
description: []
price: 9.49
Brand: LouisChoice
Material: Silicone
Color: Assorted Color
categories: ['Home & Kitchen', 'Kitchen & Dining', 'Kitchen Utensils & Gadgets', 'Bar & Wine Tools', 'Wine Stoppers & Pourers', 'Wine Stoppers']


## Load the HuggingFaceEmbeddings model, which is free and has no limit on the number of embeddings
## OpenAIEmbeddings has a limit on the number of embeddings and would add a cost per embedding

In [7]:
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# Embedding model used throughout the notebook; the same object is passed to
# FAISS when the index is loaded from disk.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")





## This function builds each document's page_content from the product title, price, rating, color, categories, review title and review text. Embeddings are generated from this text and used for semantic search.

In [8]:
def load_docs(df, start_index=0):
    """Build one Document per DataFrame row, beginning at row position ``start_index``.

    The document text combines product title, price, rating, color, categories,
    review title and review text. Rating, price, product title, parent ASIN and
    the row's index label are stored as metadata.
    """

    def _row_to_document(label, record):
        # Adjacent f-strings concatenate into the single text that gets embedded.
        text = (
            f"Product: {record.get('product_title', 'Unknown')}. "
            f"Price: ${record.get('price', 'N/A')}."
            f"Rating: {record.get('rating', 'N/A')} stars. "
            f"Color: {record.get('Color', 'N/A')}. "
            f"Categories: {record.get('categories', 'N/A')}. "
            f"ReviewTitle: {record.get('review_title', '')}. "
            f"Review: {record.get('review_text', '')}"
        )
        details = {
            "rating": record.get("rating", "N/A"),
            "price": record.get("price", "N/A"),
            "product_title": record.get("product_title", "N/A"),
            "parent_asin": record.get("parent_asin", "N/A"),
            "index": label,
        }
        return Document(page_content=text, metadata=details)

    remaining_rows = df.iloc[start_index:]
    return [_row_to_document(label, record) for label, record in remaining_rows.iterrows()]

# Load all the records in the dataframe as documents using the load_docs function 

In [9]:
# Start from the first row: nothing is assumed to be embedded yet at this point.
existing_size = 0    
# Get new documents to embed
# (this rebinds `docs`, replacing the CSVLoader documents loaded earlier)
docs = load_docs(df_renamed, start_index=existing_size)


# Check the page_content for one of the docs 

In [None]:
# Sanity check: how many documents were built, then the second document in full
# (page_content plus metadata).
print(len(docs))
print((docs[1]))

754079
page_content='Product: Fortune Candy 8-Inch Fry Pan with Lid, 3-ply Skillet, 18/8 Stainless Steel, Dishwasher Safe, Induction Ready, Silver (Mirror Finish). Price: $24.99.Rating: 5 stars. Color: Mirror Finish. Categories: ['Home & Kitchen', 'Kitchen & Dining', 'Cookware', 'Pots & Pans', 'Skillets']. ReviewTitle: Stailess, healthier than coated pans. Review: Great little stainless steel, balanced, good weight, frying pan with lide' metadata={'rating': 5, 'price': 24.99, 'product_title': 'Fortune Candy 8-Inch Fry Pan with Lid, 3-ply Skillet, 18/8 Stainless Steel, Dishwasher Safe, Induction Ready, Silver (Mirror Finish)', 'parent_asin': 'B08C7JYKZH', 'index': 1}


# Set the path for the vector database. If an index already exists at that path it is loaded; otherwise the parent folder is created so the index can be written there later, in batches of 500 documents.

In [10]:
# Folder in which the FAISS index is persisted (passed to FAISS.load_local below).
faiss_index_path = "Resources/vector"

# Number of documents embedded per batch.
batch_size = 500

# Check if FAISS index exists
if os.path.exists(faiss_index_path):
    print("Loading existing FAISS index...")
    # NOTE(security): allow_dangerous_deserialization=True lets LangChain unpickle
    # the stored docstore; only load index files you created yourself.
    vector_store = FAISS.load_local(faiss_index_path, embedding_model, allow_dangerous_deserialization=True)
    existing_size = vector_store.index.ntotal  # Number of vectors stored
    print(f"Existing FAISS index has {existing_size} embeddings.")
else:
    print("Creating new FAISS index...")
    # exist_ok=True: the parent folder already holds the CSV, so a plain
    # os.makedirs would raise FileExistsError here.
    os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
    # Bind the same names as the load branch so later cells never hit a NameError.
    vector_store = None
    existing_size = 0

Loading existing FAISS index...
Existing FAISS index has 205500 embeddings.


# This function will create the vector embeddings in batches and will store the vector file locally for every batch.  This way if we terminate this function, it will pick up from where it left off. 

In [12]:
def store_incrementally_in_faiss(docs, faiss_index_path, batch_size=500):
    """Embed ``docs`` into a FAISS index on disk, resuming after what is already stored.

    The index is saved after every batch, so an interrupted run can be restarted
    and continues with the first document that has not been embedded yet. The
    number of vectors already in the index is used as the offset into ``docs``,
    so ``docs`` must be in the same order on every run.

    Args:
        docs: Sequence of documents to embed (must support slicing and ``len``).
        faiss_index_path: Folder in which FAISS keeps ``index.faiss`` / ``index.pkl``.
        batch_size: Number of documents embedded and persisted per step.

    Returns:
        None.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Relies on the notebook-level ``embedding_model`` to compute the embeddings.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Ensure the parent directory exists. dirname is '' for a bare folder name,
    # and os.makedirs('') would raise, so only create it when there is one.
    parent_dir = os.path.dirname(faiss_index_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # FAISS.save_local writes "<folder>/index.faiss"; test for that file so an
    # empty or unrelated folder is not mistaken for a saved index.
    index_file = os.path.join(faiss_index_path, "index.faiss")

    if os.path.exists(index_file):
        print("Loading existing FAISS index...")
        # NOTE(security): allow_dangerous_deserialization=True unpickles the stored
        # docstore; only load index files you created yourself.
        vector_store = FAISS.load_local(faiss_index_path, embedding_model, allow_dangerous_deserialization=True)
        existing_size = vector_store.index.ntotal  # Number of vectors stored
        print(f"Existing FAISS index contains {existing_size} embeddings.")
    else:
        print("Creating new FAISS index...")
        vector_store = None  # created from the first batch below
        existing_size = 0

    # Get only new documents
    new_docs = docs[existing_size:]

    if not new_docs:
        print("No new documents to embed. FAISS index is up-to-date.")
        return

    # new_docs is already sliced past the stored vectors, so batching starts at 0.
    # Saving after each batch lets an interrupted run resume where it left off.
    for offset in range(0, len(new_docs), batch_size):
        batch = new_docs[offset:offset + batch_size]
        print(f"Processing batch {offset // batch_size + 1} with {len(batch)} documents starting from index {existing_size + offset}...")
        if vector_store is None:
            # No index on disk yet: the first batch creates the store.
            vector_store = FAISS.from_documents(batch, embedding_model)
        else:
            vector_store.add_documents(batch)
        vector_store.save_local(faiss_index_path)

    # Check if FAISS index was created
    if os.path.exists(index_file):
        print(f"FAISS index successfully saved at: {faiss_index_path}")
    else:
        print("FAISS index was NOT created! Check for errors.")

# Call the store_incrementally_in_faiss function

In [None]:
# Reuse the notebook's batch_size constant instead of repeating the literal 500,
# so the batch size only has to be changed in one place.
store_incrementally_in_faiss(docs, faiss_index_path, batch_size)