In [1]:
import os
import requests

pdf_path = "human-nutrition-text.pdf"


# download the pdf

if not os.path.exists(pdf_path):

  print("File doesn't exist, Start downloading....")

  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  filename = pdf_path

  response = requests.get(url)


  if response.status_code == 200:

    with open(filename, 'wb') as F:
      F.write(response.content)

    print(f"The file has been downloaded and saved as {filename}.")

  else:
    print(f"Failed to download the file. Status Code: {response.status_code}")

else:
  print("File already exists.")

File already exists.


In [2]:
import fitz

def basic_preprocessing(text : str ) -> str:
  """This is the helper function use for basic preprocessing."""

  cleaned_text = text.replace("\n", " ").strip()
  return cleaned_text


def open_and_read_pdf(pdf_path : str) -> str:

  pages_and_text = []
  doc = fitz.open(pdf_path)

  for page_number, text in enumerate(doc):
    text = text.get_text()
    text = basic_preprocessing(text)
    pages_and_text.append(
        {
            "page_number" : page_number - 41,
            "word_count" : len(text.split(" ")),
            "sentence_count" : len(text.split(". ")),
            "character_count" : len(text),
            "token_count" : len(text) / 4,
            "text" : text
        }
    )

  return pages_and_text

In [3]:
pages_and_text = open_and_read_pdf(pdf_path)

In [4]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x20279333690>

In [5]:
for items in pages_and_text:
  items["sentences"] = list(nlp(items['text']).sents)

  items["sentences"] = [str(sentences) for sentences in items["sentences"]]

  items["page_sentence_count_spacy"] = len(items["sentences"])

In [6]:
pages_and_text

[{'page_number': -41,
  'word_count': 4,
  'sentence_count': 1,
  'character_count': 29,
  'token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition',
  'sentences': ['Human Nutrition: 2020 Edition'],
  'page_sentence_count_spacy': 1},
 {'page_number': -40,
  'word_count': 1,
  'sentence_count': 1,
  'character_count': 0,
  'token_count': 0.0,
  'text': '',
  'sentences': [],
  'page_sentence_count_spacy': 0},
 {'page_number': -39,
  'word_count': 54,
  'sentence_count': 1,
  'character_count': 320,
  'token_count': 80.0,
  'text': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE',
  'sentences': ['Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCH

In [7]:
num_sentence_chunk_size = 10

def split_sentences(sentences : list[str], slice_size :int = num_sentence_chunk_size) -> list[list[str]]:

  return [sentences[i : i + slice_size] for i in range(0, len(sentences), slice_size)]

In [8]:
for items in pages_and_text:

  items["sentences_chunks"] = split_sentences(items["sentences"])
  items["num_chunks"] = len(items["sentences_chunks"])

In [11]:
pages_and_text[320]

{'page_number': 279,
 'word_count': 298,
 'sentence_count': 13,
 'character_count': 1894,
 'token_count': 473.5,
 'text': 'Benefits of Sugar Substitutes  Consuming foods and beverages containing sugar substitutes may  benefit health by reducing the consumption of simple sugars, which  are higher in calories, cause tooth decay, and are potentially linked  to chronic disease. Artificial sweeteners are basically non-nutrients  though not all are completely calorie-free. However, because they  are so intense in sweetness they are added in very small amounts  to foods and beverages. Artificial sweeteners and sugar alcohols are  not “fermentable sugars” and therefore they do not cause tooth  decay. Chewing gum with artificial sweeteners is the only proven  way that artificial sweeteners promote oral health. The American  Dental Association (ADA) allows manufacturers of chewing gum to  label packages with an ADA seal if they have convincing scientific  evidence demonstrating their product eit

In [12]:
import re

pages_and_chunks = []

for items in pages_and_text:

  for chunk in items["sentences_chunks"]:

    chunk_dict = {}

    chunk_dict["page_number"] = items["page_number"]
    
    # join the sentences together into a paragraph-like, aka join the list of sentences into one paragraph

    joined_sentence_chunk = "".join(chunk).replace("  "," ").strip()
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
    joined_sentence_chunk = re.sub(r"\s+", " ", joined_sentence_chunk) # This will remove all weird whitespaces



    chunk_dict["sentence_chunk"] = joined_sentence_chunk

    # Get some stats

    chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
    chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
    chunk_dict["chunk_sentence_count"] = len(chunk)
    chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

    pages_and_chunks.append(chunk_dict)

In [19]:
pages_and_chunks[422]

{'page_number': 258,
 'sentence_chunk': 'Without energy none of the other life processes are performed. Although our bodies can synthesize glucose it comes at the cost of protein destruction. As with all nutrients though, carbohydrates are to be consumed in moderation as having too much or too little in the diet may lead to health problems. Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities. These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). 258 | The Functions of Carbohydrates in the Body',
 'chunk_char_count': 666,
 'chunk_word_count': 102,
 'chunk_sentence_count': 6,
 'chunk_token_count': 166.5}

In [9]:
import pandas as pd
df = pd.DataFrame(pages_and_chunks)

In [10]:
# Filter out DataFrame for rows under 30 tokens
min_token_length = 30

pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] > min_token_length].to_dict(orient = "records")

In [11]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path = 'sentence-transformers/all-mpnet-base-v2', device = 'cpu')

docs = []

for items in tqdm(pages_and_chunks_over_min_token_length):
  docs.append(items["sentence_chunk"])

print(f"Number of documents: {len(docs)}")


100%|██████████| 1680/1680 [00:00<?, ?it/s]

Number of documents: 1680





In [12]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [13]:
embedding_model = embedding_model.to(device)

In [14]:
%%time

embeddings = embedding_model.encode(
    docs,
    show_progress_bar = True,
    batch_size = 32,
    convert_to_tensor = True
)

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

CPU times: total: 19 s
Wall time: 18.2 s


In [15]:
# save the embeddings to file
pages_and_chunks_over_min_token_length


[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_sentence_count': 1,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_sentence_count': 1,
  'chunk_token_count': 52.5},
 {'page_number': -37,
  'sentence_chunk': 'Contents Preface University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program xxv Ab

In [16]:
embeddings

tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0')

In [17]:
for i, items in enumerate(pages_and_chunks_over_min_token_length):
    items["embedding"] = embeddings[i].cpu().numpy()

In [18]:
import random
random.sample(pages_and_chunks_over_min_token_length, k = 2)

[{'page_number': 661,
  'sentence_chunk': '• Abnormal heart rate Iron-deficiency anemia is diagnosed from characteristic signs and symptoms and confirmed with simple blood tests that count red blood cells and determine hemoglobin and iron content in blood. Anemia is most often treated with iron supplements and increasing the consumption of foods that are higher in iron. Iron supplements have some adverse side effects including nausea, constipation, diarrhea, vomiting, and abdominal pain. Reducing the dose at first and then gradually increasing to the full dose often minimizes the side effects of iron supplements. Avoiding foods and beverages high in phytates and also tea (which contains tannic acid and polyphenols, both of which impair iron absorption), is important for people who have iron-deficiency anemia. Eating a dietary source of vitamin C at the same time as iron-containing foods improves absorption of nonheme iron in the gut. Additionally, unknown compounds that likely reside i

In [19]:
# save the embeddings to file

text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_length)
embeddings_df_save_path = 'text_chunks_and_embeddings_df.csv'

text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index = False)

In [20]:
# import saved files and check

df = pd.read_csv(embeddings_df_save_path)

df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_sentence_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,1,77.0,[ 6.74242675e-02 9.02281702e-02 -5.09549724e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,1,52.5,[ 5.52156121e-02 5.92139885e-02 -1.66167002e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,116,2,191.5,[ 2.79801711e-02 3.39813828e-02 -2.06426550e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,3,235.25,[ 6.82566985e-02 3.81274857e-02 -8.46855063e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,1,249.5,[ 3.30264606e-02 -8.49766284e-03 9.57159605e-...


In [21]:
import numpy as np
df["embedding"] = df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep = " "))

df['embedding']


0       [0.0674242675, 0.0902281702, -0.00509549724, -...
1       [0.0552156121, 0.0592139885, -0.0166167002, -0...
2       [0.0279801711, 0.0339813828, -0.020642655, 0.0...
3       [0.0682566985, 0.0381274857, -0.00846855063, -...
4       [0.0330264606, -0.00849766284, 0.00957159605, ...
                              ...                        
1675    [0.0185622461, -0.0164277833, -0.0127045615, -...
1676    [0.0334720835, -0.0570441261, 0.0151489452, -0...
1677    [0.0770515278, 0.00978559162, -0.0121817421, 0...
1678    [0.103045158, -0.0164702013, 0.00826844759, 0....
1679    [0.086377345, -0.0125359055, -0.0112746879, 0....
Name: embedding, Length: 1680, dtype: object

In [22]:
# convert out embedding to torch.tensor

embeddings = torch.tensor(np.vstack(df['embedding'].tolist())).to(device)

In [23]:
embeddings = embeddings.to(torch.float32)

In [24]:
query = "Macronutrients Functions"

query_embedding = embedding_model.encode(
    query,
    convert_to_tensor = True
).to(device)

In [25]:
query_embedding.shape

torch.Size([768])

In [26]:
query_embedding.dtype, embeddings.dtype

(torch.float32, torch.float32)

In [27]:
# Get similarty scores with the dot product

from time import perf_counter as timer

start_time = timer()

dot_scores = torch.matmul(query_embedding, embeddings.T)


end_time = timer()

print(f"Time taken to compute dot product similarity scores: {end_time - start_time} seconds")

Time taken to compute dot product similarity scores: 0.002534400000001824 seconds


In [28]:
# Get top k results

top_k = 5

top_k_scores , top_k_indices = torch.topk(dot_scores, k = top_k, sorted = True)

In [29]:
top_k_scores

tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473], device='cuda:0')

In [30]:
top_k_indices[0].item()

42

In [31]:
import textwrap

def print_wrapped(text : str, width : int = 150):
  wrapped_text = textwrap.fill(text, width = width)
  print(wrapped_text)

In [32]:
query = "Macrinutrients Functions"

print(f"Query: {query}\n")
print(f"Results: \n")

for score, idx in zip(top_k_scores, top_k_indices):

    print(f"Score : {score.item()}")
    print(f"Text:")
    print_wrapped(df.iloc[idx.item()]['sentence_chunk'], width = 150)
    print(f"Page Number: {df.iloc[idx.item()]['page_number']}")
    print("\n")

Query: Macrinutrients Functions

Results: 

Score : 0.6925809383392334
Text:
Macronutrients Nutrients that are needed in large amounts are called macronutrients. There are three classes of macronutrients: carbohydrates, lipids,
and proteins. These can be metabolically processed into cellular energy. The energy from macronutrients comes from their chemical bonds. This chemical
energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions. A unit of
measurement of food energy is the calorie. On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied
by one thousand. A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition
food labels. Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield
calories. Carbohydrates Carbo

In [86]:
# functionizing the retrieval process

def retrieve_top_k_documents(query : str,
                             embeddings : torch.Tensor,
                             model : SentenceTransformer = embedding_model,
                             top_k : int = 5,
                             print_time : bool = True):
    
    query_embedding = model.encode(
        query,
        convert_to_tensor = True
    )

    start_time = timer()

    dot_scores = torch.matmul(query_embedding, embeddings.T)

    end_time = timer()

    # if print_time:
    #     print(f"Time taken to compute dot product similarity scores: {end_time - start_time} seconds")

    scores, indexes = torch.topk(dot_scores, k = top_k)

    return scores, indexes

In [34]:
def print_top_results_and_scores(query : str,
                                 embeddings :torch.Tensor,
                                 pages_and_chunks : list[dict] = df,
                                 model : SentenceTransformer = embedding_model,
                                 top_k : int = 5,
                                 print_time : bool = True):
    
    query_embedding = model.encode(
        query,
        convert_to_tensor = True
    ).to(device)

    start_time = timer()

    dot_scores = torch.matmul(query_embedding, embeddings.T)

    end_time = timer()

    if print_time:
        print(f"Time taken to compute dot product similarity scores: {end_time - start_time} seconds")

    top_k_scores, top_k_indices = torch.topk(dot_scores, k = top_k)

    print(f" Query : {query} \n")

    for scores, idx in zip(top_k_scores, top_k_indices):
        print(f"Score : {score}")
        print(f"Text")
        print_wrapped(df.iloc[idx.item()]['sentence_chunk'])
        print("\n")

In [35]:
query = "foods high in fiber"

print_top_results_and_scores(query = query, embeddings = embeddings)

Time taken to compute dot product similarity scores: 5.2799999991748336e-05 seconds
 Query : foods high in fiber 

Score : 0.6472818851470947
Text
• Change it up a bit and experience the taste and satisfaction of other whole grains such as barley, quinoa, and bulgur. • Eat snacks high in fiber,
such as almonds, pistachios, raisins, and air-popped popcorn. Add an artichoke and green peas to your dinner plate more 276 | Carbohydrates and
Personal Diet Choices


Score : 0.6472818851470947
Text
Dietary fiber is categorized as either water-soluble or insoluble. Some examples of soluble fibers are inulin, pectin, and guar gum and they are found
in peas, beans, oats, barley, and rye. Cellulose and lignin are insoluble fibers and a few dietary sources of them are whole-grain foods, flax,
cauliflower, and avocados. Cellulose is the most abundant fiber in plants, making up the cell walls and providing structure. Soluble fibers are more
easily accessible to bacterial enzymes in the large intestin

### Checking our local GPU memory availability

In [36]:
import torch

gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = (gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb : .2f} GB")

Available GPU memory:  6.00 GB


In [37]:
!nvidia-smi

Sun Dec 28 11:49:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 571.96                 Driver Version: 571.96         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   51C    P4              8W /   55W |    1515MiB /   6141MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [38]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 5.99658203125 | Recommended model: Gemma 2B in 4-bit precision.
use_quantization_config set to: True
model_id set to: google/gemma-2b-it


### Loading a LLM Locally

In [39]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# Bonus flash attention 2 = faster attention mechanism 
# # flash attention 2 requires GPU with a compute capability with a score of 8.0+ ( Ampere, Ada Lovelace, Hopper and above)
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = 'flash_attention_2'
else:
    attn_implementation = 'sdpa' # scaled dot product attention


# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_id)


# Instantiate the LLM Module
llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path = model_id,
    torch_dtype = torch.float16,
    low_cpu_mem_usage = False, # we want to use as much memory as we can
    attn_implementation = attn_implementation
).to(device)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
def gemma_model_num_params(model : torch.nn.Module):

    return sum([params.numel() for params in model.parameters()])

gemma_model_num_params(llm_model)

2506172416

In [41]:
def model_mem_size(model : torch.nn.Module):

    # Get the model parameters and buffer sizes

    mem_params = sum([params.nelement() * params.element_size() for params in model.parameters()])
    mem_buffer = sum([buff.nelement() * buff.element_size() for buff in model.buffers()])

    # calculate the model sizes

    model_mem_bytes = mem_params + mem_buffer
    model_mem_mb = model_mem_bytes / (1024**2)
    model_mem_gb = model_mem_bytes / (1024**3)

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": model_mem_mb,
            "model_mem_gb": model_mem_gb}


model_mem_size(llm_model)

{'model_mem_bytes': 5012345344,
 'model_mem_mb': 4780.14501953125,
 'model_mem_gb': 4.668110370635986}

### Generate text with our Local LLM

In [42]:
input_text = "What are Macronutrients? What role they do play in human bodies?"


print(f"\n Input Text: \n {input_text}")

# Create a prompt template for instruction-tuned models

dialouge_template = [
    {"role": "user",
     "content":input_text}
]

# apply chat template

prompt = tokenizer.apply_chat_template(
    conversation = dialouge_template,
    tokenize = False,
    add_generation_prompt = True
)


print(f"\n Prompt (formatted):\n{prompt}")


 Input Text: 
 What are Macronutrients? What role they do play in human bodies?

 Prompt (formatted):
<bos><start_of_turn>user
What are Macronutrients? What role they do play in human bodies?<end_of_turn>
<start_of_turn>model



In [43]:
input_ids = tokenizer(prompt,
                      return_tensors = "pt").to(device)

input_ids

{'input_ids': tensor([[     2,      2,    106,   1645,    108,   1841,    708,  97586, 184592,
         235336,   2439,   4731,    984,    749,   1554,    575,   3515,  13266,
         235336,    107,    108,    106,   2516,    108]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [44]:
%%time

output = llm_model.generate(**input_ids,
                             max_new_tokens = 256)

print(f"Model Output (tokens): \n{output[0]}\n")

Model Output (tokens): 
tensor([     2,      2,    106,   1645,    108,   1841,    708,  97586, 184592,
        235336,   2439,   4731,    984,    749,   1554,    575,   3515,  13266,
        235336,    107,    108,    106,   2516,    108,  21404, 235269,   1517,
        235303, 235256,    476,  13367,    576,  97586, 184592, 235292,    109,
           688,  12298,   1695, 184592,    688,    708,  37132,    674,    573,
          2971,   4026,    575,   2910,  15992,    577,  10528,   1426,   2962,
        235265,   2365,    708,  13538,   1280,   1378,   1872,  14486, 235292,
        186809, 184592,    578,  92800, 184592, 235265,    109,    688,  12298,
          1695, 184592,    688,    708,  37132,    674,    573,   2971,   4026,
           575,   8107,  15992, 235269,   3359, 235292,    109, 235287,   5231,
        156615,  56227,  66058,  34428,   4134,    604,    573,   2971, 235303,
        235256,   5999,    578,  29703, 235265,    108, 235287,   5231,  49471,
         66058, 

In [45]:
output_decoded = tokenizer.decode(output[0])
print(f"Model Output (Decoded): \n{output_decoded}\n")

Model Output (Decoded): 
<bos><bos><start_of_turn>user
What are Macronutrients? What role they do play in human bodies?<end_of_turn>
<start_of_turn>model
Sure, here's a summary of Macronutrients:

**Macronutrients** are nutrients that the body needs in large amounts to maintain good health. They are divided into two main categories: macronutrients and micronutrients.

**Macronutrients** are nutrients that the body needs in larger amounts, including:

* **Carbohydrates:** Provide energy for the body's cells and tissues.
* **Protein:** Builds and repairs tissues, enzymes, and hormones.
* **Fat:** Insulates the body, helps absorb vitamins, and provides energy.

**Micronutrients** are nutrients that the body needs in smaller amounts, including:

* **Vitamin B12:** Essential for red blood cell production.
* **Vitamin C:** Helps the body fight infection and disease.
* **Vitamin D:** Helps the body absorb calcium and phosphorus.
* **Potassium:** Helps regulate fluid balance and muscle contrac

### Augmentation

In [47]:
# Nutrition-style questions generated with GPT4
gpt4_questions = [
    "What are the macronutrients, and what roles do they play in the human body?",
    "How do vitamins and minerals differ in their roles and importance for health?",
    "Describe the process of digestion and absorption of nutrients in the human body.",
    "What role does fibre play in digestion? Name five fibre containing foods.",
    "Explain the concept of energy balance and its importance in weight management."
]

# Manually created question list
manual_questions = [
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the RDI for protein per day?",
    "water soluble vitamins"
]

query_list = gpt4_questions + manual_questions
query_list

['What are the macronutrients, and what roles do they play in the human body?',
 'How do vitamins and minerals differ in their roles and importance for health?',
 'Describe the process of digestion and absorption of nutrients in the human body.',
 'What role does fibre play in digestion? Name five fibre containing foods.',
 'Explain the concept of energy balance and its importance in weight management.',
 'How often should infants be breastfed?',
 'What are symptoms of pellagra?',
 'How does saliva help with digestion?',
 'What is the RDI for protein per day?',
 'water soluble vitamins']

In [97]:
import random

def prompt_formatter(
        query:str,
        context_items : list[dict]
) -> str:
    
    
    context = "- " + "\n - ".join([item["sentence_chunk"] for item in context_items])

    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:""" 
    base_prompt = base_prompt.format(context=context,
                                     query=query)

    # Create prompt template for instruction-tuned model 
    dialogue_template = [
        {"role": "user",
         "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                           tokenize=False,
                                           add_generation_prompt=True)
    
    return prompt

query = random.choice(query_list)
print(f"Query : {query}")
score, indices = retrieve_top_k_documents(query=query,
                                          embeddings=embeddings)


context_items = [df.iloc[i.item()] for i in indices]

prompt = prompt_formatter(query=query,
                          context_items=context_items)

print(prompt)


Query : What role does fibre play in digestion? Name five fibre containing foods.
<bos><start_of_turn>user
Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.

Example 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.

Example 2:
Que

In [98]:
input_ids = tokenizer(prompt,
                      return_tensors = "pt").to(device)


output = llm_model.generate(**input_ids,
                            temperature = 0.7,
                            do_sample = True,
                            max_new_tokens = 256)

output_decoded = tokenizer.decode(output[0])

print(f"Query : {query}")
print(f"RAG Answer : {output_decoded.replace(prompt,"")}")

Query : What role does fibre play in digestion? Name five fibre containing foods.
RAG Answer : <bos>The relevant passage suggests that dietary fiber is a crucial component of a balanced diet for maintaining overall health. It explains that fiber promotes the growth and health of colonic cells, inhibits inflammatory responses in the colon, and stimulates the immune system. Additionally, it mentions the importance of fiber for regulating body temperature and facilitating the passage of waste products.

Five fiber-containing foods are mentioned in the context:

1. Whole grains
2. Vegetables
3. Beans
4. Nuts and seeds
5. Legumes<eos>


In [102]:
def ask(query : str,
        temperature : float = 0.7,
        max_new_tokens : int = 256,
        format_answer : bool = True,
        return_answer_only : bool = True):

    score, indices = retrieve_top_k_documents(query,
                                              embeddings=embeddings)
    
    context_items = [df.iloc[i.item()] for i in indices]

    for i, item in enumerate(context_items):
        item["score"] = score[i].cpu().item()

    prompt = prompt_formatter(query = query, context_items = context_items)

    input_ids = tokenizer(prompt, return_tensors = "pt").to(device)

    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)
    
    output_text = tokenizer.decode(outputs[0])


    if format_answer:
        # Replace prompt and special tokens
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "")

    # Only return the answer without context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [104]:
query = random.choice(query_list)
print(f"Query: {query}")
ask(query=query,
    temperature=0.2,
    return_answer_only=False)

Query: What is the RDI for protein per day?


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item["score"] = score[i].cpu().item()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item["score"] = score[i].cpu().item()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item["score"] = score[i].cpu().item()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item["score"] = score[i].cpu().item()
A value is tryin

('The passage states that the RDI for protein is 0.8 grams of protein per kilogram of body weight.',
 [page_number                                                           410
  sentence_chunk          Most nitrogen is lost as urea in the urine, bu...
  chunk_char_count                                                      466
  chunk_word_count                                                       87
  chunk_sentence_count                                                    5
  chunk_token_count                                                   116.5
  embedding               [0.08559414, 0.012894514, 0.0214104373, 0.0256...
  score                                                             0.67929
  Name: 609, dtype: object,
  page_number                                                           409
  sentence_chunk          Proteins, Diet, and Personal Choices UNIVERSIT...
  chunk_char_count                                                     1006
  chunk_word_count                 