In [3]:
import os 
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from urllib.request import urlretrieve
from pinecone import Pinecone, ServerlessSpec
from uuid import uuid4
import time
from tqdm.autonotebook import tqdm


<h2>Loading data</h2>

In [4]:
files = [
    "https://aubmc.org.lb/Documents/publications/patient_info/Nutrition-tips-cancer.pdf",
    "https://www.nyc.gov/assets/doh/downloads/pdf/cdp/kot-simple-steps.pdf",
    "https://www.lambethtogether.net/wp-content/uploads/2023/12/Thrive-for-Life-Healthy-Eating-and-Living-Guide.pdf",
    "https://sunnybrook.ca/uploads/YNC_guidelines.pdf",
    "https://www.cancer.org/content/dam/cancer-org/cancer-control/en/booklets-flyers/nutrition-for-the-patient-with-cancer-during-treatment.pdf",
    "https://jbsnourishwell.com/wp-content/uploads/2020/08/Immune-Boosting-Recipes-Dorine-Lam.pdf"
]

os.makedirs('nutrition-pdfs',exist_ok=True)

In [None]:
import os
from urllib.request import urlretrieve
from urllib.error import URLError, HTTPError

for url in files:
    try:
        file_path = os.path.join("nutrition-pdfs", url.rpartition("/")[2])
        urlretrieve(url, file_path)  
    except (URLError, HTTPError) as e:
        print(f"Failed to access {url}: {e}")




<h2>CHUNKING</h2>

In [6]:
loader = PyPDFDirectoryLoader("nutrition-pdfs") #extracts contents.
listed_docs = loader.load() 
listed_docs[3]

Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2020-04-01T13:35:51+07:00', 'moddate': '2020-04-01T13:36:05+07:00', 'trapped': '/False', 'source': 'nutrition-pdfs\\Immune-Boosting-Recipes-Dorine-Lam.pdf', 'total_pages': 23, 'page': 3, 'page_label': '4'}, page_content='4\nImmune-Boosting Recipes\nA Natural Guide to Supporting the Immune System')

In [7]:

len(listed_docs[3].page_content)

73

the page_content is big, so we need to chunk it and then save the chunks into the vector database

In [8]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300,
    chunk_overlap = 50
)

chunked_docs = text_splitter.split_documents(listed_docs)

In [9]:
chunk = chunked_docs[0]
print(chunk)

page_content='2
Immune-Boosting Recipes
Table of Contents
Introduction 3
A Natural Guide to Supporting the Immune System 4
Ginger Immune-Boosting Soup 5
 Turmeric Carrot Muffins 6
Ginger Tea 7
Miso Soup 8
Elderberry Syrup 9
Chicken Soup 10
Turmeric Pumpkin Chocolate Bread  11
Beet Soup 12-13' metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2020-04-01T13:35:51+07:00', 'moddate': '2020-04-01T13:36:05+07:00', 'trapped': '/False', 'source': 'nutrition-pdfs\\Immune-Boosting-Recipes-Dorine-Lam.pdf', 'total_pages': 23, 'page': 1, 'page_label': '2'}


<h2>Embedding</h2>

In [None]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2", 
    model_kwargs={"device": "cpu"},
    encode_kwargs={"noramlize_embeddings": True},  
    # each element of the embedding vec/the vector's length.
)

  huggingface_embeddings = HuggingFaceBgeEmbeddings(


In [12]:
huggingface_embeddings.embed_query("healthy lifestyle")

[0.021097246557474136,
 0.11736216396093369,
 -0.0033356579951941967,
 0.09038276970386505,
 0.01594405062496662,
 0.0744909793138504,
 0.029698798432946205,
 -0.0019271569326519966,
 -0.06773744523525238,
 -0.025223594158887863,
 0.055608537048101425,
 -0.008975597098469734,
 -0.025906456634402275,
 -0.0904611274600029,
 0.07071403414011002,
 -0.030114520341157913,
 -0.004793976433575153,
 0.05068514868617058,
 -0.02524253912270069,
 0.04608156159520149,
 0.04724415764212608,
 0.0893820971250534,
 0.10140310227870941,
 0.05375228077173233,
 -0.06003967672586441,
 -0.009722604416310787,
 0.06718632578849792,
 -0.06300997734069824,
 0.009487570263445377,
 0.030938230454921722,
 0.034358441829681396,
 0.03607778251171112,
 0.06148277595639229,
 0.003869605017825961,
 -0.01803467608988285,
 0.01584046520292759,
 -0.007943956181406975,
 -0.0475761741399765,
 -0.04486670345067978,
 -0.028957577422261238,
 -0.036229491233825684,
 -0.05761326849460602,
 -0.00315973162651062,
 0.00237301760353

<h2>Vector DataBase</h2>

In [13]:
# Pinecone is a managed vector database optimized for handling datasets 
# where you store embeddings (vectors) generated from your data

In [None]:
pinecone_api = "**"
pc = Pinecone(api_key=pinecone_api)

In [15]:
#index: data structure enables efficient similarity search and retrieval of high-dimensional vectors
# optimize the search process
index_name = "langchain-test-index"  
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [16]:
##Initializes to hold the embeddings.
pine_vectorstore = PineconeVectorStore(index=index, embedding=huggingface_embeddings)

In [17]:
#Pinecone needs a unique identifier (ID) for each

In [18]:
#assigning IDs to each doc and adding them to the Pinecone vector store for eff similarity search and retrieval.
uuids = [str(uuid4()) for _ in range(len(chunked_docs))]

pine_vectorstore.add_documents(documents=chunked_docs, ids=uuids)

['56691952-5524-491a-bdb8-08bd6e8d854c',
 'f73a841e-4d16-4db0-bf75-66018914604c',
 'f999f853-0f8d-43e9-9c4f-b4bd2f45de8d',
 '9ea3eeac-d7a7-4821-8db3-5069b3e5fdcf',
 '349f37f5-811e-487f-9045-1f912c03fbc1',
 'b4961934-2e19-479f-9982-06bc6850f8e3',
 '3e42cd90-8048-479c-9d99-9359f9708eb0',
 '91879ce0-9603-4238-9f4a-84ee390c0866',
 'ad4f352a-1f34-4427-8227-a3ba8aec366f',
 '3da77dde-f407-4a02-bf4d-e6bbca2f13ae',
 '317f8441-c22a-4761-9227-4b62ee146aa6',
 '0de64de9-2078-4345-abb2-956d0971f812',
 'b47c0ad4-f8a5-4998-9645-c3dbec32497d',
 'e4dc7a2d-7175-4f56-a979-db0af800fc42',
 '7efc66d6-8366-411e-847f-342dbf6d5056',
 '4d3819c1-2624-41fd-8c6c-e614d687dec3',
 '1ce8c765-efdc-46e6-9a48-422b4621436d',
 'd4f11952-9b70-49e0-b352-f0db30dc07dc',
 'd4b9faa4-e836-4753-a186-d88b2fe8ef8f',
 '93e9395f-0592-4598-853a-4942983ed1da',
 '892a6aa3-821a-4258-a918-4c6a2e1f845a',
 '03764871-7ebe-4c7a-a8f3-86aa4c01688c',
 'e8795962-9196-4855-95bd-2493fa9dd3d1',
 'f7fd8148-70d3-4123-ac7c-6cdb5a3a8397',
 '52af8c46-e26b-

In [None]:
# local vector store 
vectorstore = FAISS.from_documents(chunked_docs,huggingface_embeddings)

In [20]:
query = "give me Immune boosting recipes"

<h4>use the similarity score on the vector_db</h2>

In [21]:
results = pine_vectorstore.similarity_search(query,k=5)
for res in results:
	print(f"{res.page_content} [{res.metadata}]")
	print("*****")

4
Immune-Boosting Recipes
A Natural Guide to Supporting the Immune System [{'creationdate': '2020-04-01T13:35:51+07:00', 'creator': 'Adobe InDesign CS6 (Windows)', 'moddate': '2020-04-01T13:36:05+07:00', 'page': 3.0, 'page_label': '4', 'producer': 'Adobe PDF Library 10.0.1', 'source': 'nutrition-pdfs\\Immune-Boosting-Recipes-Dorine-Lam.pdf', 'total_pages': 23.0, 'trapped': '/False'}]
*****
4
Immune-Boosting Recipes
A Natural Guide to Supporting the Immune System [{'creationdate': '2020-04-01T13:35:51+07:00', 'creator': 'Adobe InDesign CS6 (Windows)', 'moddate': '2020-04-01T13:36:05+07:00', 'page': 3.0, 'page_label': '4', 'producer': 'Adobe PDF Library 10.0.1', 'source': 'nutrition-pdfs\\Immune-Boosting-Recipes-Dorine-Lam.pdf', 'total_pages': 23.0, 'trapped': '/False'}]
*****
7
Immune-Boosting Recipes
INGREDIENTS A
INGREDIENTS B (OPTIONAL)
INGREDIENTS C (OPTIONAL)
• 1½ tsp grated ginger root
• 1½ cups of water
• 1 tsp honey or brown sugar to taste
• 1 tsp fennel seeds 
• 3 pods of crush

In [22]:
#apply similarity search to the query using vectorstore.similarity_search()
relevant_docs = vectorstore.similarity_search(query)
print(relevant_docs[0].page_content)

4
Immune-Boosting Recipes
A Natural Guide to Supporting the Immune System


<h4>build a retriever to pass the similar context to LLM</h4>

In [23]:
pinecone_retriever = pine_vectorstore.as_retriever(search_type="similarity_score_threshold",search_kwargs={"k":3,"score_threshold" : 0.6})

In [None]:
#retrieves similar text from the vectorstore

retriever = vectorstore.as_retriever(search_type="similarity" , search_kwargs={"k" : 3})

In [None]:
access_token = "**"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = access_token

In [None]:
hf = HuggingFaceHub(
    repo_id='mistralai/Mistral-Nemo-Instruct-2407',#'mistralai/Mistral-7B-v0.1'
    #repo_id='google/byt5-small',
    model_kwargs = {"temperature" : 0.1 , "max_length" : 400}
)

  hf = HuggingFaceHub(


In [27]:
output = hf.invoke(query)
print(output)



give me Immune boosting recipes

I'm glad to help you boost your immune system with some delicious and nutritious recipes! Here are some immune-boosting ingredients and recipes to try:

1. **Citrus Fruits**: Rich in Vitamin C, which helps boost your immune system.
   - **Recipe: Immune-Boosting Smoothie**
     - Blend together 1 orange, 1 grapefruit, 1-inch ginger, 1 cup pineapple, and 1 cup coconut water. Add a scoop of protein powder for extra nutrition.

2. **Garlic**: Known for its antiviral and antibacterial properties.
   - **Recipe: Garlic and Lemon Chicken Soup**
     - Sauté 4 cloves of minced garlic in olive oil, add 1 diced onion, 2 diced carrots, and 2 diced celery stalks. Cook until softened. Add 6 cups of chicken broth, 1 cup of diced chicken, salt, pepper, and 1 lemon sliced. Simmer for 20 minutes.

3. **Ginger**: Has anti-inflammatory properties and helps fight infections.
   - **Recipe: Ginger Tea**
     - Steep 1-inch of grated ginger in hot water for 10 minutes. Stra

<h4>Prompt Template</h4>

In [28]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
. If you find the answer, write the answer in a concise way with five sentences maximum unless its a recipe it can be longer.
. Only use the provided context to answer the question. Do not use any external sources. 
. If the user is asking about medical advice or serious illness add the end that you are just a guide and he sould always refer to a doctor.
{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)

<h4>Combine Every Thing</h4>

In [29]:
#retriever uusing pinecone
pinecone_retievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever = pinecone_retriever,
    return_source_documents = True,
    chain_type_kwargs={"prompt":PROMPT}
)

In [30]:
#retriever from a local vector store FAISS
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf, 
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs = {"prompt" : PROMPT}
)

In [31]:
pinecone_result = pinecone_retievalQA.invoke({"query":query})
print(pinecone_result['result'])



Use the following pieces of context to answer the question at the end. Please follow the following rules:
. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
. If you find the answer, write the answer in a concise way with five sentences maximum unless its a recipe it can be longer.
. Only use the provided context to answer the question. Do not use any external sources. 
. If the user is asking about medical advice or serious illness add the end that you are just a guide and he sould always refer to a doctor.
4
Immune-Boosting Recipes
A Natural Guide to Supporting the Immune System

4
Immune-Boosting Recipes
A Natural Guide to Supporting the Immune System

7
Immune-Boosting Recipes
INGREDIENTS A
INGREDIENTS B (OPTIONAL)
INGREDIENTS C (OPTIONAL)
• 1½ tsp grated ginger root
• 1½ cups of water
• 1 tsp honey or brown sugar to taste
• 1 tsp fennel seeds 
• 3 pods of crushed cardamom
• Blueber

In [32]:
pinecone_result_2 = pinecone_retievalQA.invoke({"query":"is processed meat linked to cancer??"})
print(pinecone_result_2['result'])



Use the following pieces of context to answer the question at the end. Please follow the following rules:
. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
. If you find the answer, write the answer in a concise way with five sentences maximum unless its a recipe it can be longer.
. Only use the provided context to answer the question. Do not use any external sources. 
. If the user is asking about medical advice or serious illness add the end that you are just a guide and he sould always refer to a doctor.
hot dogs and sausages. 
Why limit red meat and processed meat?
There is convincing evidence that eating higher 
amounts of red and processed meats increases 
colorectal cancer risk. The process (smoking, 
curing) and additives (preservatives, salt) used 
to preserve meats can form cancer causing

hot dogs and sausages. 
Why limit red meat and processed meat?
There is convincing evid

In [36]:
result = retrievalQA.invoke({"query":query})



In [37]:
print(result['result'])

Use the following pieces of context to answer the question at the end. Please follow the following rules:
. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
. If you find the answer, write the answer in a concise way with five sentences maximum unless its a recipe it can be longer.
. Only use the provided context to answer the question. Do not use any external sources. 
. If the user is asking about medical advice or serious illness add the end that you are just a guide and he sould always refer to a doctor.
4
Immune-Boosting Recipes
A Natural Guide to Supporting the Immune System

7
Immune-Boosting Recipes
INGREDIENTS A
INGREDIENTS B (OPTIONAL)
INGREDIENTS C (OPTIONAL)
• 1½ tsp grated ginger root
• 1½ cups of water
• 1 tsp honey or brown sugar to taste
• 1 tsp fennel seeds 
• 3 pods of crushed cardamom
• Blueberries
• Mint leaves
• A squeeze of lemon
METHOD

3
Immune-Boosting Recipes
I

In [38]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: nutrition-pdfs\Immune-Boosting-Recipes-Dorine-Lam.pdf, Page: 3
Content: 4
Immune-Boosting Recipes
A Natural Guide to Supporting the Immune System
----------------------------------------------------------------------------------------------------
There are 3 documents retrieved which are relevant to the query.
Relevant Document #2:
Source file: nutrition-pdfs\Immune-Boosting-Recipes-Dorine-Lam.pdf, Page: 6
Content: 7
Immune-Boosting Recipes
INGREDIENTS A
INGREDIENTS B (OPTIONAL)
INGREDIENTS C (OPTIONAL)
• 1½ tsp grated ginger root
• 1½ cups of water
• 1 tsp honey or brown sugar to taste
• 1 tsp fennel seeds 
• 3 pods of crushed cardamom
• Blueberries
• Mint leaves
• A squeeze of lemon
METHOD
--------------------------------------------------------------------------------------------------