In [1]:
from langchain.document_loaders import DirectoryLoader,PyPDFLoader,UnstructuredWordDocumentLoader,TextLoader,UnstructuredHTMLLoader,UnstructuredMarkdownLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
from langchain_chroma import Chroma

import google.generativeai as genai

from dotenv import load_dotenv
import os
import shutil

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# Credentials come from the environment; either value is None when the variable is unset.
HF_TOKEN = os.environ.get('HF_TOKEN')
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

# Directory handed to Chroma as its persist_directory.
CHROMA_PATH = "chroma"

# Embedding function backed by the Hugging Face Inference API.
hugging_face_ef = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    api_key=HF_TOKEN,
)

# Gemini client used as the LLM.
genai.configure(api_key=GOOGLE_API_KEY)
llm_model = genai.GenerativeModel("gemini-1.5-flash")


# Loading Docs

In [3]:
# Root folder that every DirectoryLoader in this notebook scans for source documents.
directory_path = "data"

In [25]:
# pdf: every *.pdf found recursively under directory_path is parsed with PyPDFLoader
pdf_loader = DirectoryLoader(
    directory_path,
    glob="**/*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True,
)
pdf_docs = pdf_loader.load()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:20<00:00, 10.05s/it]


In [5]:
# txt: every *.txt found recursively under directory_path is read with TextLoader
txt_loader = DirectoryLoader(
    directory_path,
    glob="**/*.txt",
    loader_cls=TextLoader,
    # TextLoader opens files with the platform default codec; if that fails to decode
    # a file, let it fall back to encoding detection instead of aborting the load.
    loader_kwargs={"autodetect_encoding": True},
    show_progress=True,
    use_multithreading=True,
)
txt_docs = txt_loader.load()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 32.59it/s]


In [6]:
# md: every *.md found recursively under directory_path goes through UnstructuredMarkdownLoader
md_loader = DirectoryLoader(
    directory_path,
    glob="**/*.md",
    loader_cls=UnstructuredMarkdownLoader,
    show_progress=True,
    use_multithreading=True,
)
md_docs = md_loader.load()

  0%|          | 0/1 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:33<00:00, 33.82s/it]


In [7]:
# docx: every *.docx found recursively under directory_path goes through UnstructuredWordDocumentLoader
docx_loader = DirectoryLoader(
    directory_path,
    glob="**/*.docx",
    loader_cls=UnstructuredWordDocumentLoader,
    show_progress=True,
    use_multithreading=True,
)
docx_docs = docx_loader.load()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:02<00:00,  2.95s/it]


In [8]:
# html: every *.html found recursively under directory_path goes through UnstructuredHTMLLoader
html_loader = DirectoryLoader(
    directory_path,
    glob="**/*.html",
    loader_cls=UnstructuredHTMLLoader,
    show_progress=True,
    use_multithreading=True,
)
html_docs = html_loader.load()


  0%|          | 0/1 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:02<00:00,  2.34s/it]


In [9]:
"""
Document(metadata={'source': 'data\\review_paper.pdf', 'page': 10}, page_content="‚Ä¢ AR content optimization: ...")
"""
pdf_docs[10]

Document(metadata={'source': 'data\\review_paper.pdf', 'page': 10}, page_content="‚Ä¢ AR content optimization: Improvement in the delivery of AR content through enhancements in adaptive\nsystems so that sensitivity to the viewer's proximity and angle of view is greatly enhanced to Ô¨Ålter the\ncontext so that the amount of appearing content is user-friendly.  Using customization attributes that\nincorporate recommendation and gamiÔ¨Åcation, and also personalizing the content to the person, will make\nthe AR-based food application more interactive for users. \n‚Ä¢ Therefore, the drivers of this gap between healthy intentions and actual behavior of users, will therefore\nlead to better tools for the facilitation of healthier choice. \n‚Ä¢ Expanding ingredient libraries: The further an ingredient database is expanded with local and cultural\ningredients, the more ingredient-substitution systems will both accurately and more appropriately react to\ndiverse diets. AI and Machine Learning: A

In [10]:
""" 
Document(metadata={'source': 'data\\food_ingredient_dataset.txt'}, page_content="D. Food Ingredient Databases:...")
"""
txt_docs

[Document(metadata={'source': 'data\\food_ingredient_dataset.txt'}, page_content="D. Food Ingredient Databases:\nAccessible and comprehensive food ingredient databases are a benchmark for mobile applications meant to\nenhance dietary choices and outcomes. It contains detailed data of food products, among which are the\ningredients, nutritional values, and potential allergens. [4] One such example includes the open source\ndatabase known as Open Food Facts, containing over 50,000 entries of food products present in 134\ncountries. [5] However, often, custom databases need to be created with regard to specific requirements so\nthat local food products may be covered. [8] \nOne is PHARA, which uses a client-server architecture with a MongoDB database to implement\nrecommendations of healthy foods. The database contains items as well as user profiles. It feeds this\ninformation into the application's recommendation engine whereby consumers marked out healthier foods\nthey liked and wanted 

In [None]:
"""
Document(metadata={'source': 'data\\ar_influence_on_market.md'}, page_content="AR's Influence on Consumer Behavior...")]
"""
md_docs

[Document(metadata={'source': 'data\\ar_influence_on_market.md'}, page_content="AR's Influence on Consumer Behavior: Potential and Pitfalls\n\nSources investigate the effects of Augmented Reality on consumer behavior, from developing the shopping experience to more challenges entailed in increasing complexity and ease of use. Although AR brings some innovative ways of interaction with the consumer in order to provide information, effectiveness is lower than the other, much more common influences, such as FOP labels.\n\nImproving Food Choices Using Augmented Reality\n\nA number of studies demonstrate the potential of AR to encourage healthier food choices. A study [9] explored the feasibility of an AR application which gives users personalized suggestions of healthy products to purchase in supermarkets. The application identified shelf products and overlaid color-coded flags, thus allowing users to point out healthy foods quickly along with bad ones to avoid. Another app [1] aimed to gu

In [None]:
"""
[Document(metadata={'source': 'data\\introduction_and_background.docx'}, page_content='Introduction And Background\...')]
"""
docx_docs

[Document(metadata={'source': 'data\\introduction_and_background.docx'}, page_content='Introduction And Background\nINTRODUCTION\nIt has been richly noted that information technology could catalyze an important set of benefits in the\nhealthcare area which would include improving the quality and reducing the cost of healthcare. The\nemergence of sensor-rich powerful smart phones to provide a rich set of user contextual information in real\ntime made it feasible to provide effective and affordable healthcare to nearly everyone via smartphones.\nMore specifically, well-designed mobile phone applications can empower individuals to proactively embrace\nhealth and wellness. No longer is the health care system made of a reactive system or placed sitting back\nwaiting for medical attention to surface via an ER visit. What once belonged to the clinic is now patient-\ncentered care. What once focused on the disease agenda is now wellness in health care.\nBased on the sheer number of excellent j

In [None]:
"""
[Document(metadata={'source': 'data\\ingredient_substituent.html'}, page_content="Ingredient Substitutions and FoodKG\....")]
"""
html_docs

[Document(metadata={'source': 'data\\ingredient_substituent.html'}, page_content="Ingredient Substitutions and FoodKG\n\nA comprehensive approach to ingredient substitutions and healthy alternatives using FoodKG and DIISH heuristic.\n\nIngredient Substitutions\n\nSource [10] has examined all possible identification directions and the suggestion of ingredient alternatives. FoodKG is described as a knowledge graph that allows ranking the most plausible alternatives for explicit semantic information, as well as the implicit semantics captured by word embeddings, leading users toward healthy choices based on dietary requirements and preferences.\n\nFoodKG: A Food Knowledge Graph\n\nFoodKG is a knowledge graph of recipes and their ingredients, sourced from various references:\n\nFood Category: FoodKG utilizes knowledge from the FoodOn ontology to classify ingredients.\n\nNutritional Content: FoodKG associates ingredients with USDA data, offering detailed nutritional information (calories, m

In [26]:
# Single flat list of every loaded Document, in loader order (pdf, txt, md, docx, html).
docs = [*pdf_docs, *txt_docs, *md_docs, *docx_docs, *html_docs]

In [12]:
# Report how many Document objects each loader produced, followed by the combined total.
print(
    *(
        f"len({label}) = {len(group)}"
        for label, group in {
            "pdf_docs": pdf_docs,
            "txt_docs": txt_docs,
            "md_docs": md_docs,
            "docx_docs": docx_docs,
            "html_docs": html_docs,
            "docs": docs,
        }.items()
    ),
    sep="\n",
)

len(pdf_docs) = 37
len(txt_docs) = 1
len(md_docs) = 1
len(docx_docs) = 1
len(html_docs) = 1
len(docs) = 41


# Text Splitting

In [27]:
# NOTE(review): `add` is not used in any visible cell — this looks like an accidental
# editor auto-import (triggered by typing `add_start_index`); confirm and remove it.
from numpy import add


# Splitter configuration:
#   chunk_size / chunk_overlap are measured with length_function (len, i.e. characters);
#   add_start_index=True records a 'start_index' entry in each chunk's metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True
)


# Split every loaded document into chunks and report how many were produced.
texts = text_splitter.split_documents(docs)
print(f"{len(texts) = }")

len(texts) = 172


In [24]:
# Inspect the chunks produced by the splitter (the whole list is echoed as the cell output).
texts

[Document(metadata={'source': 'data\\review_paper.pdf', 'page': 0, 'start_index': 0}, page_content="AI Based AR Application for Food Ingredients\nAnalysis: A Systematic Review\nPrathamesh K. Sable Mr. \n \n, \nParth C. Desai Mr. \n \n, \nShivam B. Thorat Mr. \n \n, \nKrunal A. Changan Mr. \n \n,\nShobha S. Raskar Mrs. \n1.\n Computer Engineering, Modern Education Society's Wadia College of Engineering, Pune, Pune, IND\nCorresponding authors: \nPrathamesh K. Sable Mr., \nprathameshks2003@gmail.com, \nParth C. Desai Mr.,\nparth.desai0910@gmail.com, \nShivam B. Thorat Mr., \nshivamthorat1077@gmail.com, \nKrunal A. Changan Mr.,\nkrunalchangan@gmail.com, \nShobha S. Raskar Mrs., \nshobha.raskar@mescoepune.org\nAbstract\nGroceries shopping forms the everyday need of most the person. However, there is a large variety of almost\nsimilar products that can be found lined up at the shelf in the supermarket. Customers spend the most time\nin the supermarket while reading the ingredient list on the

In [14]:
def calculate_chunk_ids(chunks):
    """Attach a deterministic ``"id"`` to the metadata of every chunk that lacks one.

    IDs look like ``"data/monopoly.pdf:6:2"``, i.e.
    ``<source>:<page>:<index of the chunk within that source/page>``.

    Args:
        chunks: iterable of objects exposing a mutable ``metadata`` dict
            (``source`` and optionally ``page`` keys are read).

    Returns:
        The same ``chunks`` object; the chunks are modified in place.

    Notes:
        * A chunk without a ``page`` entry is assigned ``page = 0``.
        * A chunk that already carries an ``id`` is left untouched, but it still
          consumes one index slot of its page. Without that, the next untagged
          chunk on the same page would restart at index 0 and could duplicate
          an existing ID.
        * Indices are tracked per ``source:page`` pair, so chunks of the same
          page receive distinct IDs even when they are not adjacent in
          ``chunks``.
    """
    # Next free chunk index for each "source:page" key seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        metadata = chunk.metadata
        already_tagged = "id" in metadata

        # Only chunks we are about to tag get the default page written back.
        if not already_tagged and "page" not in metadata:
            metadata["page"] = 0

        source = metadata.get("source")
        page = metadata.get("page", 0)
        current_page_id = f"{source}:{page}"

        # Reserve this chunk's slot on its page, whether or not it needs an ID.
        current_chunk_index = next_index_by_page.get(current_page_id, 0)
        next_index_by_page[current_page_id] = current_chunk_index + 1

        if already_tagged:
            continue

        # Add the computed ID to the chunk's metadata.
        metadata["id"] = f"{current_page_id}:{current_chunk_index}"

    return chunks

In [18]:
def add_to_chroma(chunks, embedding_function, CHROMA_PATH=CHROMA_PATH):
    """Bring the persisted Chroma collection in line with ``chunks``.

    Steps, in order:
      1. open the store persisted under ``CHROMA_PATH``;
      2. tag every chunk with an ID via ``calculate_chunk_ids``;
      3. delete stored entries whose ID is not among the incoming chunks;
      4. add the chunks whose ID is not stored yet.

    Chunks whose ID is already stored are not re-added or updated.

    Args:
        chunks: documents to synchronise into the store.
        embedding_function: embedding object passed to ``Chroma``.
        CHROMA_PATH: persist directory of the store.
    """
    store = Chroma(
        embedding_function=embedding_function,
        persist_directory=CHROMA_PATH,
    )

    tagged_chunks = calculate_chunk_ids(chunks)

    # include=[] skips fetching any payload; the IDs are always part of the result.
    stored_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(stored_ids)}")

    # Anything stored that the incoming chunk set no longer contains is stale.
    incoming_ids = {chunk.metadata["id"] for chunk in tagged_chunks}
    stale_ids = stored_ids - incoming_ids
    if stale_ids:
        print(f"üëâ Deleting documents: {len(stale_ids)}")
        store.delete(ids=list(stale_ids))

    # Only chunks the store has never seen get added.
    fresh_chunks = [
        chunk for chunk in tagged_chunks if chunk.metadata["id"] not in stored_ids
    ]
    if not fresh_chunks:
        print("‚úÖ No new documents to add")
        return

    print(f"üëâ Adding new documents: {len(fresh_chunks)}")
    fresh_ids = [chunk.metadata["id"] for chunk in fresh_chunks]
    store.add_documents(fresh_chunks, ids=fresh_ids)

In [16]:
def reset_db(CHROMA_PATH=CHROMA_PATH):
    """Remove the directory at ``CHROMA_PATH`` if it exists.

    Deletion runs with ``ignore_errors=True``, so a failed or partial removal
    is not reported to the caller.
    """
    if not os.path.exists(CHROMA_PATH):
        # Nothing persisted yet, so there is nothing to clear.
        return
    shutil.rmtree(CHROMA_PATH, ignore_errors=True)


In [28]:
# Sync the split chunks into the persisted Chroma store, using the Hugging Face embedding function.
add_to_chroma(texts,hugging_face_ef)

Number of existing documents in DB: 266
üëâ Deleting documents: 94
‚úÖ No new documents to add
