In [1]:
import os
import requests
import pickle

# Get PDF document path
pdf_path = "../PDF/book.pdf"

# Download PDF
if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading...")

    # Enter the URL of the PDF
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # The local filename to save the downloaded file
    filename = pdf_path

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Open the file and save it
        with open(filename, "wb") as file:
            file.write(response.content) 
        print(f"[INFO] The file has been download and saved as {filename}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {response.status_code}")

else:
    print(f"File {pdf_path} exists.")

File ../PDF/book.pdf exists.


In [2]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

In [3]:
len(pages)

1179

In [4]:
pages[3].page_content

'Contents \nPreface \nUniv ersity of Hawai‘i a t Mānoa Food Scienc e and \nHuman Nutri tion Pr ogram and H uman Nutri tion \nProgram xxv \nAbout the Con tributors \nUniv ersity of Hawai‘i a t Mānoa Food Scienc e and \nHuman Nutri tion Pr ogram and H uman Nutri tion \nProgram xxvi \nAckno wledgemen ts \nUniv ersity of Hawai‘i a t Mānoa Food Scienc e and \nHuman Nutri tion Pr ogram and H uman Nutri tion \nProgram xl \nPart I. Chapter 1. Basic Conc epts in Nutrition \nIntroduc tion \nUniv ersity of Hawai‘i a t Mānoa Food Scienc e and \nHuman Nutri tion Pr ogram and H uman Nutri tion \nProgram 3 \nFood Quali ty \nUniv ersity of Hawai‘i a t Mānoa Food Scienc e and \nHuman Nutri tion Pr ogram and H uman Nutri tion \nProgram 14 \nUnits of Measur e \nUniv ersity of Hawai‘i a t Mānoa Food Scienc e and \nHuman Nutri tion Pr ogram and H uman Nutri tion \nProgram 18'

In [5]:
def text_formatter(text: str) -> str: 
    """Performs minor formatting on text."""
    text.page_content = text.page_content.replace("\n", " ").strip()

    # Potentially more text formatting functions can go here
    return text

In [6]:
pages[:5]

[Document(page_content='Human Nutrition: 2020 Edition', metadata={'source': '../PDF/book.pdf', 'page': 0}),
 Document(page_content='Human Nutrition: 2020 \nEdition \nUNIVER SITY OF HAWAI‘I AT MĀNOA \nFOOD SCIENCE AND HUMAN \nNUTRITION PROGRAM \nALAN TITCHENAL, SKYLAR HARA, \nNOEMI ARCEO CAA CBA Y, WILLIAM \nMEINKE-LA U, YA-YUN YANG, MARIE \nKAINO A FIALK OWSKI REVILLA, \nJENNIFER DRAPER, GEMAD Y \nLANGFELDER, CHER YL GIBBY , CHYNA \nNICOLE CHUN, AND ALLISON \nCALABRESE', metadata={'source': '../PDF/book.pdf', 'page': 2}),
 Document(page_content='Human Nutrition: 2020 Edition by Univer sity of H awai‘i at Mānoa F ood Science and \nHuman Nutrition Pr ogram is licensed under a Creative Commons A ttribution 4. 0 \nInternational L icense , except wher e otherwise noted.', metadata={'source': '../PDF/book.pdf', 'page': 3}),
 Document(page_content='Contents \nPreface \nUniv ersity of Hawai‘i a t Mānoa Food Scienc e and \nHuman Nutri tion Pr ogram and H uman Nutri tion \nProgram xxv \nAbout th

In [7]:
pages = list(map(text_formatter,pages))

In [8]:
pages[:5]

[Document(page_content='Human Nutrition: 2020 Edition', metadata={'source': '../PDF/book.pdf', 'page': 0}),
 Document(page_content='Human Nutrition: 2020  Edition  UNIVER SITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAA CBA Y, WILLIAM  MEINKE-LA U, YA-YUN YANG, MARIE  KAINO A FIALK OWSKI REVILLA,  JENNIFER DRAPER, GEMAD Y  LANGFELDER, CHER YL GIBBY , CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE', metadata={'source': '../PDF/book.pdf', 'page': 2}),
 Document(page_content='Human Nutrition: 2020 Edition by Univer sity of H awai‘i at Mānoa F ood Science and  Human Nutrition Pr ogram is licensed under a Creative Commons A ttribution 4. 0  International L icense , except wher e otherwise noted.', metadata={'source': '../PDF/book.pdf', 'page': 3}),
 Document(page_content='Contents  Preface  Univ ersity of Hawai‘i a t Mānoa Food Scienc e and  Human Nutri tion Pr ogram and H uman Nutri tion  Program xxv  About the Con tributors  Un

In [9]:
import numpy as np

page_len = [len(text.page_content) for text in pages]
print(f"Average no. of characters on a page: {sum(page_len)/len(page_len)}")

Average no. of characters on a page: 1258.748939779474


In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=400,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

In [11]:
from langchain.docstore.document import Document

chunked_pages = []

for page in pages:
    texts = text_splitter.split_text(page.page_content)
    
    for text in texts:
        chunked_pages.append(Document(page_content=text,metadata=page.metadata))

In [12]:
chunked_pages[:10]

[Document(page_content='Human Nutrition: 2020 Edition', metadata={'source': '../PDF/book.pdf', 'page': 0}),
 Document(page_content='Human Nutrition: 2020  Edition  UNIVER SITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAA CBA Y, WILLIAM  MEINKE-LA U, YA-YUN YANG, MARIE  KAINO A FIALK OWSKI REVILLA,  JENNIFER DRAPER, GEMAD Y  LANGFELDER, CHER YL GIBBY , CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE', metadata={'source': '../PDF/book.pdf', 'page': 2}),
 Document(page_content='Human Nutrition: 2020 Edition by Univer sity of H awai‘i at Mānoa F ood Science and  Human Nutrition Pr ogram is licensed under a Creative Commons A ttribution 4. 0  International L icense , except wher e otherwise noted.', metadata={'source': '../PDF/book.pdf', 'page': 3}),
 Document(page_content='Contents  Preface  Univ ersity of Hawai‘i a t Mānoa Food Scienc e and  Human Nutri tion Pr ogram and H uman Nutri tion  Program xxv  About the Con tributors  Un

In [28]:
CHUNKED_PAGES_PATH = "../chunked_pages/chunked_pages.pkl"

with open(CHUNKED_PAGES_PATH,'wb') as f:
    pickle.dump(chunked_pages,f,protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
from langchain.embeddings import HuggingFaceEmbeddings

EMBEDDING_PATH = "../embedding/embedding.pkl"
# Initialize an instance of HuggingFaceEmbeddings with the specified parameters

if os.path.isfile(EMBEDDING_PATH):
    with open(EMBEDDING_PATH,'rb') as f:
        embeddings = pickle.load(f)
    
    print(f"Loaded downloaded embeddings from path {EMBEDDING_PATH}")
else:
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",            # Provide the pre-trained model's path
        model_kwargs={'device':'mps'},                  # Pass the model configuration options
        encode_kwargs={'normalize_embeddings': True}    # Pass the encoding options
    )
    
    with open(EMBEDDING_PATH,'wb') as f:
        pickle.dump(embeddings,f,protocol=pickle.HIGHEST_PROTOCOL)
        
    print(f"Saved downloaded embeddings to path {EMBEDDING_PATH}")

  from .autonotebook import tqdm as notebook_tqdm


Loaded downloaded embeddings from path ../embedding/embedding.pkl


In [14]:
from langchain.vectorstores import Chroma

DB_PATH = "../../app/human_nutrition_vectorstore"

if len(os.listdir(DB_PATH))==0:
    db = Chroma.from_documents(chunked_pages, embeddings, persist_directory=DB_PATH)
    db.persist()
    print(f"DB created at path {DB_PATH}")
else:
    db = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)
    print(f"Loaded DB created from path {DB_PATH}")

DB created at path ../../app/human_nutrition_vectorstore


  warn_deprecated(


In [15]:
query = "reduce the risk of developing diet-related chronic disease."
docs = db.similarity_search(query)

In [16]:
len(docs) #4

4

In [17]:
chunked_pages

[Document(page_content='Human Nutrition: 2020 Edition', metadata={'source': '../PDF/book.pdf', 'page': 0}),
 Document(page_content='Human Nutrition: 2020  Edition  UNIVER SITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAA CBA Y, WILLIAM  MEINKE-LA U, YA-YUN YANG, MARIE  KAINO A FIALK OWSKI REVILLA,  JENNIFER DRAPER, GEMAD Y  LANGFELDER, CHER YL GIBBY , CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE', metadata={'source': '../PDF/book.pdf', 'page': 2}),
 Document(page_content='Human Nutrition: 2020 Edition by Univer sity of H awai‘i at Mānoa F ood Science and  Human Nutrition Pr ogram is licensed under a Creative Commons A ttribution 4. 0  International L icense , except wher e otherwise noted.', metadata={'source': '../PDF/book.pdf', 'page': 3}),
 Document(page_content='Contents  Preface  Univ ersity of Hawai‘i a t Mānoa Food Scienc e and  Human Nutri tion Pr ogram and H uman Nutri tion  Program xxv  About the Con tributors  Un

In [18]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# Define your BM25 and Chroma DB retrievers here (replace with your specific setup)
bm25_retriever = BM25Retriever.from_documents(chunked_pages)
chroma_db_retriever = db.as_retriever(search_type="mmr",search_kwargs={"k": 4})

# Combine with optional weights (defaults to equal weights)
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, chroma_db_retriever], weights=[0.5, 0.5]
)
# Use the ensemble retriever for your query
query = "reduce the risk of developing diet-related chronic disease."
documents = ensemble_retriever.get_relevant_documents(query)[0]

  warn_deprecated(


In [19]:
documents

Document(page_content='f scien tific  evidenc e suppor ting tha t replacing r efined grains wi th whole gr ains  decreases the risk f or obesi ty, Type 2 diabe tes, and c ardiovascular  disease. W hole gr ains ar e great die tary sour ces o f fiber, vitamins,  miner als, he althy fa ts, and a v ast amoun t of benef icial plan t  chemic als, all o f which c ontribute to the ef fects of whole gr ains  on he alth. Ea ting', metadata={'source': '../PDF/book.pdf', 'page': 307})

In [24]:
from langchain.prompts import PromptTemplate

# Define your prompt template with placeholders for context and question
template = PromptTemplate(
    input_variables=["context", "question"],
    template="You are an expert in human nutrition and your task is to provide answer to the question using just the provided context. Don't use the context word in answer it should feel like an expert is answering. Please feel free to say you don't know if you are not able to deduce the answer from the context provided but don't try to make one, here is the context: {context}\nHuman: {question}\nAssistant:"
)

# Summarize retrieved documents for conciseness
prompt = template.format(context=documents, question=query)

In [25]:
from langchain_community.llms import Ollama

llm = Ollama(model="llama3:instruct",temperature=0.2)

llm.invoke(prompt)

"A simple yet effective approach to reducing the risk of developing diet-related chronic diseases is to incorporate more whole grains into one's diet. This can be achieved by replacing refined grains with their whole grain counterparts, which have been scientifically proven to decrease the risk of obesity, Type 2 diabetes, and cardiovascular disease. Whole grains are an excellent source of essential nutrients like fiber, vitamins, minerals, healthy fats, and beneficial plant compounds that collectively contribute to their positive effects on overall health."