# RAG basic outline

### Downloading sample pdf

In [1]:
import os 
import requests

pdf_path = './human-nutrition-text.pdf'
pdf_url = '' # enter url here
if not os.path.exists(pdf_path):
    print(f'Downloading file . . .')
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(pdf_path, "wb") as f:
            f.write(response.content)
        print(f'File is downloaded as {pdf_path}')
    else:
        print(f'[ERROR] {response.status_code}')
else:
    print(f'File {pdf_path} already exists')


File ./human-nutrition-text.pdf already exists


Opening pdf

In [2]:
import fitz # !pip install PyMuPdf
from tqdm.auto import tqdm

def formatter(text):
    clean = text.replace("\n", " ").strip()
    return clean

def open_read(pdf_path):
    doc = fitz.open(pdf_path)
    pages = []
    for page_no, page in tqdm(enumerate(doc)):
        text = formatter(page.get_text())
        pages.append({"page_no" : page_no-41 , 
                      "page_char_count" : len(text), 
                      "page_word_count" : len(text.split(" ")),
                      "page_sentence_count" : len(text.split(". ")), 
                      "page_token_count" : len(text)/4, 
                      "text": text})
    return pages

In [3]:
pages = open_read(pdf_path=pdf_path)
pages[:2]

0it [00:00, ?it/s]

[{'page_no': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_no': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [4]:
import pandas as pd
df = pd.DataFrame(pages)
df.head()

Unnamed: 0,page_no,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


### Text Splitting

In [5]:
from spacy.lang.en import English
nlp = English()
# Use spacy Sentencizer to split text to sentences
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7538e8f8dc90>

In [6]:
for item in tqdm(pages):
    item["sentences"] = list(nlp(item["text"]).sents)
    # Convert into strings from spacy datatype
    item["sentences"] = [str(x) for x in item["sentences"]]
    item["sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [7]:
pages[1098]

{'page_no': 1057,
 'page_char_count': 1841,
 'page_word_count': 308,
 'page_sentence_count': 18,
 'page_token_count': 460.25,
 'text': 'harmful microorganisms that can cause foodborne illnesses.  Therefore, people who primarily eat raw foods should thoroughly  clean all fruit and vegetables before eating them. Poultry and other  meats should always be cooked before eating.12  Vegetarian and Vegan Diets  Vegetarian and vegan diets have been followed for thousands of  years for different reasons, including as part of a spiritual practice,  to show respect for living things, for health reasons, or because of  environmental concerns. For many people, being a vegetarian is a  logical outgrowth of “thinking green.” A meat-based food system  requires more energy, land, and water resources than a plant-based  food system. This may suggest that the plant-based diet is more  sustainable than the average meat-based diet in the U.S.By avoiding  animal flesh, vegetarians hope to look after their ow

In [8]:
df = pd.DataFrame(pages)
df.head()

Unnamed: 0,page_no,page_char_count,page_word_count,page_sentence_count,page_token_count,text,sentences,sentence_count_spacy
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1
1,-40,0,1,1,0.0,,[],0
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,2


Chunking the sentences

In [9]:
chunk_size = 10

def split_list(input_list: list[str], 
               slice_size: int = chunk_size) -> list[list[str]]:
    return [input_list[i : i+slice_size] for i in range(0, len(input_list), slice_size)]

test = list(range(25))
split_list(test)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [10]:
for item in tqdm(pages):
    item["sentence_chunks"] = split_list(item["sentences"], chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [11]:
df = pd.DataFrame(pages)
df.describe().round(2)

Unnamed: 0,page_no,page_char_count,page_word_count,page_sentence_count,page_token_count,sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32,1.53
std,348.86,560.38,95.83,6.55,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0,1.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0,3.0


In [12]:
# splitting chunks into its own item

import re

pages_chunks = []
for item in tqdm(pages):
    for chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_no"]
        joined_chunk = "".join(chunk).replace("  ", " ").strip()
        joined_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_chunk) # ".A" -> ". A"
        chunk_dict["sentence_chunk"] = joined_chunk
        chunk_dict["chunk_char_count"] = len(joined_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_chunk) / 4
        pages_chunks.append(chunk_dict)

len(pages_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [13]:
df = pd.DataFrame(pages_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.1,112.74,183.52
std,347.79,447.51,71.24,111.88
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,745.0,115.0,186.25
75%,890.0,1118.0,173.0,279.5
max,1166.0,1830.0,297.0,457.5


Filter chunks of texts with short chunks as they don't have much info

In [14]:
from random import sample

min_token_length = 30
for row in df[df["chunk_token_count"] < min_token_length].sample(5).iterrows():
    print(f"Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}")

Chunk token count: 11.0 | Text: 978 | Food Supplements and Food Replacements
Chunk token count: 13.25 | Text: PART IX CHAPTER 9. VITAMINS Chapter 9. Vitamins | 513
Chunk token count: 24.75 | Text: Table 9.33 Some Phytochemical’s Obtained from Diet and Their Related Functions 600 | Phytochemicals
Chunk token count: 16.75 | Text: PART XI CHAPTER 11. TRACE MINERALS Chapter 11. Trace Minerals | 649
Chunk token count: 16.25 | Text: Complementary foods include baby meats, vegetables, Infancy | 837


Most of the above are useless sentences

In [15]:
pages_chunks_mod = df[df["chunk_token_count"] >= min_token_length].to_dict(orient="records")
sample(pages_chunks_mod, k=1)

[{'page_number': 1072,
  'sentence_chunk': 'The Health At Every Size® Principles are: 1. Weight Inclusivity: Accepting and respecting the diversity of body shapes and sizes 2. Health Enhancement: Recognizing that health and well-being are multi-dimensional and that they include physical, social, spiritual, occupational, emotional, and intellectual aspects 3. Respectful Care: Promoting all aspects of health and well- being for people of all sizes 4. Eating for Well-being: Promoting eating in a manner which balances individualized nutritional needs, hunger, satiety, appetite, and pleasure 5. Life-Enhancing Movement: Promoting individually appropriate, enjoyable, life-enhancing physical activity, rather than exercise that is focused on a goal of weight loss In the Hawaiian language, Lokahi means “unity, agreement, accord, and harmony”.23 The concept of Lokahi can be used to describe the balance between the relationship an individual has with the body, the mind, the spirit, and the rest of

### Embedding text chunks

In [42]:
model_name1 = 'all-mpnet-base-v2'
model_name2 = 'all-MiniLM-L6-v2'

In [17]:
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer(model_name_or_path=model_name2, device='cuda')

# demo  
sentences = ['This is a demo sentence to demo the easy way. ', 
             'Today is a good day to like horses', 'Icecream is the best']
embeddings = embed_model.encode(sentences=sentences)
embed_dict = dict(zip(sentences, embeddings))

for sentence, embeddings in embed_dict.items():
    print(f"Sentence: {sentence}, \n Embedding: {embeddings}")



Sentence: This is a demo sentence to demo the easy way. , 
 Embedding: [ 2.24922150e-02  9.20272991e-02  1.33178579e-02  4.66368794e-02
  2.50298996e-02  2.26061549e-02 -8.93865246e-03  1.32816667e-02
 -6.94798604e-02  5.47935478e-02  9.02975351e-02 -9.00119916e-03
  1.86062511e-02 -1.08054159e-02  7.46785700e-02 -2.84844381e-03
 -2.50427332e-03  7.94730242e-03 -3.32841016e-02  5.36604933e-02
  2.20836326e-02 -5.19668199e-02  3.11571974e-02 -6.01953119e-02
 -9.41922516e-03  4.49277610e-02 -6.37429720e-03  5.98195717e-02
  1.44849882e-01 -1.01143774e-02 -1.28989089e-02  5.53992689e-02
  6.45775199e-02 -1.05384495e-02  2.76985597e-02 -2.73017306e-02
  3.02649736e-02  1.20514429e-04 -9.01867449e-03  2.80545112e-02
 -2.21211230e-03 -7.46103600e-02 -2.23740768e-02  3.74712497e-02
 -2.28923652e-02 -1.51640614e-02 -5.46111539e-02 -3.60314287e-02
  3.48155722e-02 -1.82313975e-02  2.44543329e-02 -8.21313336e-02
 -6.88981190e-02 -1.02191791e-01  1.66651495e-02 -1.76256374e-02
 -6.89869514e-03  1

In [20]:
embeddings.shape

(384,)

In [21]:
%%time

embed_model.to('cuda')
for item in tqdm(pages_chunks_mod): 
    item["embedding"] = embed_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: user 15.2 s, sys: 52 ms, total: 15.2 s
Wall time: 15.2 s


Using batch size instead

In [23]:
%%time

text_chunks = [item["sentence_chunk"] for item in pages_chunks_mod]
text_embeddings = embed_model.encode(text_chunks, batch_size=32, convert_to_tensor=True)
text_embeddings

CPU times: user 20.7 s, sys: 67.6 ms, total: 20.8 s
Wall time: 19.6 s


tensor([[-1.9507e-02, -1.8256e-02,  7.1663e-04,  ..., -3.9904e-02,
         -2.1543e-02, -4.3294e-02],
        [ 2.6605e-02,  1.8786e-03, -7.0033e-03,  ..., -2.1591e-02,
         -2.1828e-02, -6.9988e-02],
        [ 3.4971e-02,  4.0597e-02, -1.9747e-02,  ..., -3.5225e-02,
         -3.1397e-02, -4.6613e-02],
        ...,
        [-3.2258e-02, -2.4768e-02, -1.1571e-02,  ..., -1.3906e-05,
          3.9905e-02, -5.2110e-02],
        [-8.8627e-02, -6.6496e-02, -1.0430e-01,  ..., -3.0128e-02,
          8.4004e-02, -4.2208e-02],
        [-1.1203e-01, -4.9535e-02, -2.4896e-02,  ...,  1.2983e-02,
          4.6920e-02, -2.8276e-02]], device='cuda:0')

Save embeddings to file

In [27]:
embeddings_save_path = 'text_and_embeddings.csv'

In [26]:
text_and_embeddings = pd.DataFrame(pages_chunks_mod)
text_and_embeddings.to_csv(embeddings_save_path, index=False)

In [28]:
# load it to use it again
text_and_embeddings = pd.read_csv(embeddings_save_path)
text_and_embeddings.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[-1.95069630e-02 -1.82563923e-02 7.16574199e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 2.66053174e-02 1.87852455e-03 -7.00324634e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.5,[ 3.49713489e-02 4.05969657e-02 -1.97472461e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25,[ 2.63138041e-02 2.63623390e-02 -2.12261155e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[-4.99467552e-03 5.19443788e-02 -3.91638204e-...


### Retrieval

In [55]:
# import libraries if disconnected
import random
import torch
import numpy as np
import pandas as pd

device = 'cuda' if torch.cuda.is_available() else 'cpu'
text_and_embeddings = pd.read_csv(embeddings_save_path)
text_and_embeddings["embedding"] = text_and_embeddings["embedding"].apply(lambda x : np.fromstring(x.strip("[]"), sep=" "))
embeddings = torch.tensor(np.stack(text_and_embeddings["embedding"].to_list(), axis=0, dtype=np.float32)).to(device)
pages_chunks_mod = text_and_embeddings.to_dict(orient="records")

In [56]:
embeddings.shape

torch.Size([1680, 384])

In [57]:
# Ready embedding model for semantic search
from sentence_transformers import util, SentenceTransformer

embed_model = SentenceTransformer(model_name_or_path=model_name2, device=device)



### Semantic Search Pipeline
Steps:
1. Make a query string
2. Turn the query string into an embedding
3. Perform a similarity function between the query embedding and text embeddings
4. Sort the results from 3 in descending order 

In [58]:
from time import perf_counter as timer

In [59]:
query = "functions of macronutrients"
print(f'Query : {query}')

query_embedding = embed_model.encode(query, convert_to_tensor=True).to(device=device)
start_time = timer()
dot_scores = util.dot_score(query_embedding, embeddings)[0]
end_time  = timer()
print(f'Time taken to compare: {end_time-start_time:.4f}, Embeddings length: {len(embeddings)}')

# Get the top 5 dot scores
results = torch.topk(dot_scores, k=5)


Query : functions of macronutrients
Time taken to compare: 0.0118, Embeddings length: 1680


In [62]:
results

torch.return_types.topk(
values=tensor([0.6505, 0.6417, 0.5846, 0.5800, 0.5699], device='cuda:0'),
indices=tensor([ 42,  41, 451,  47,  50], device='cuda:0'))

In [70]:
pages_chunks_mod[42]["sentence_chunk"]

'Macronutrients Nutrients that are needed in large amounts are called macronutrients. There are three classes of macronutrients: carbohydrates, lipids, and proteins. These can be metabolically processed into cellular energy. The energy from macronutrients comes from their chemical bonds. This chemical energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions. A unit of measurement of food energy is the calorie. On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied by one thousand. A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition food labels. Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield calories. Carbohydrates Carbohydrates are molecules composed of carbon, hydrogen, and oxygen.'

Make vector search results pretty

In [71]:
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [72]:
print(f'Query: {query}')
for score, idx in zip(results[0], results[1]):
    print(f'Score : {score}')
    print(f'Text: \n')
    print_wrapped(pages_chunks_mod[idx]["sentence_chunk"])
    print(f'Page number: {pages_chunks_mod[idx]["page_number"]}')
    print('\n')

Query: functions of macronutrients
Score : 0.6505149006843567
Text: 

Macronutrients Nutrients that are needed in large amounts are called
macronutrients. There are three classes of macronutrients: carbohydrates,
lipids, and proteins. These can be metabolically processed into cellular energy.
The energy from macronutrients comes from their chemical bonds. This chemical
energy is converted into cellular energy that is then utilized to perform work,
allowing our bodies to conduct their basic functions. A unit of measurement of
food energy is the calorie. On nutrition food labels the amount given for
“calories” is actually equivalent to each calorie multiplied by one thousand. A
kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with
the “Calorie” (with a capital “C”) on nutrition food labels. Water is also a
macronutrient in the sense that you require a large amount of it, but unlike the
other macronutrients, it does not yield calories. Carbohydrates Carbohydrate

Make above semantic search pipeline into a function

In [83]:
def retrieve(query, embeddings, model=embed_model, num_returns=5, print_time=False, print_results=False, data=None):

    query_embedding = model.encode(query, convert_to_tensor=True).to(device=device)
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()
    scores, indices = torch.topk(dot_scores, k=num_returns)

    if print_time:
        print(f'Time : {end_time-start_time:.6f}')
    
    if print_results:
        assert data 
        print(f'Query: {query}')
        for score, idx in zip(scores, indices):  
            print(f'Score : {score}')
            print(f'Text: \n')
            print_wrapped(pages_chunks_mod[idx]["sentence_chunk"])
            print(f'Page number: {pages_chunks_mod[idx]["page_number"]}')
            print('\n')
        
    return scores, indices

In [84]:
retrieve("foods high in fat", embeddings=embeddings, print_time=True, print_results=True, data=pages_chunks_mod)

Time : 0.000086
Query: foods high in fat
Score : 0.6621803641319275
Text: 

Thus, it is important to learn to reduce the intake of foods high in saturated
fat. High- fat foods can be consumed but they must fall within the overall goal
for a person’s fat allowance for the day. • Home cooking. Limit the use of
saturated fats in home preparation of meals. Instead of butter try spreads made
from unsaturated oils such as canola or olive oils and the use of cooking
sprays. Couple this with the use of herbs and spices to add flavor. Avoid using
high-fat meat gravies, cheese, and cream sauces. Limit adding extras to foods
such as butter on a baked potato. Use nonfat sour cream instead. Grill, bake,
stir- fry, roast, or bake your foods.
Page number: 353


Score : 0.6503816246986389
Text: 

A Personal Choice about Lipids UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND
HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM A Guide to Making Sense of
Dietary Fat On your next trip to the grocery store

(tensor([0.6622, 0.6504, 0.6447, 0.6349, 0.6176], device='cuda:0'),
 tensor([538, 532, 504, 502, 457], device='cuda:0'))

### LLM for Generation

Check GPU memory available

In [87]:
import torch
gpu_mem_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_gb = round(gpu_mem_bytes / (2**30))
print(f'Available GPU memory : {gpu_gb}GB')

Available GPU memory : 2GB


In [90]:
%%capture
# install groq for llm
!pip install groq
# dotenv helps with env variables
!pip install python-dotenv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [100]:
from dotenv import load_dotenv
from groq import Groq
load_dotenv()
client = Groq(
    api_key=os.getenv("GROQ_API_KEY"),
)

In [102]:
model_llm_name = 'llama3-8b-8192'

In [104]:
%%time
input_text = 'What are the macronutrients and what role do they play in the human body?'
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": input_text,
        }
    ],
    model=model_llm_name,
)

CPU times: user 13 ms, sys: 3.18 ms, total: 16.2 ms
Wall time: 1.82 s


In [105]:
print(chat_completion.choices[0].message.content)

The three main macronutrients are carbohydrates, proteins, and fats. Each of these macronutrients plays a vital role in the human body, providing energy, building and repairing tissues, and regulating various bodily functions. Here's a brief overview of each:

1. **Carbohydrates**:
	* Major source of energy for the body (approximately 55-60% of daily energy needs)
	* Broken down into glucose, which is absorbed into the bloodstream and transported to cells for energy production
	* Simple carbohydrates (sugars) are quickly digested and provide rapid energy, while complex carbohydrates (starches and fibers) are digested more slowly and provide sustained energy
	* Examples: bread, pasta, fruits, vegetables, grains
2. **Proteins**:
	* Build and repair tissues, such as muscles, bones, skin, and hair (approximately 15-20% of daily energy needs)
	* Played a crucial role in producing enzymes, hormones, and other biomolecules
	* Can be used as a source of energy when carbohydrate stores are depl

### Augmentation

In [117]:
def prompt_formatter(query:str, context_items:list[dict]) -> str:
    
    context = '- ' + "\n- ".join([item["sentence_chunk"] for item in context_items])
    prompt = f"""Based on the following context items answer the query. 
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, return only the answer.
    Make sure your answers are as explanatory as possible.
    Context_items: {context}
    Query : {query}
    Return your answers in the below format:
    Query : <the query here>
    Answer : <Your answer>
    """
    return prompt

In [118]:
query = "What are the best sources of protein?"
scores, indices = retrieve(query=query, embeddings=embeddings, print_time=True)
context_items = [pages_chunks_mod[i] for i in indices]
prompt = prompt_formatter(query, context_items=context_items)

print(f'Query : {query}')
print(f'Prompt : \n {prompt}')

Time : 0.000519
Query : What are the best sources of protein?
Prompt : 
 Based on the following context items answer the query. 
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, return only the answer.
    Make sure your answers are as explanatory as possible.
    Context_items: - Dietary Sources of Protein The protein food group consists of foods made from meat, seafood, poultry, eggs, soy, dry beans, peas, and seeds. According to the Harvard School of Public Health, “animal protein and vegetable protein probably have the same effects on health. It’s the protein package that’s likely to make a difference.”1 1. Protein: The Bottom Line. Harvard School of Public Proteins, Diet, and Personal Choices | 411
- Foods that contain some of the essential amino acids are called incomplete protein sources, while those that contain all nine essential amino acids are called complete protein sources, or high- 

In [122]:
%%time
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model=model_llm_name,
)
rag_answer = chat_completion.choices[0].message.content
print(rag_answer.replace(prompt, ''))

Query : What are the best sources of protein?
Answer : The best sources of protein are animal-based foods such as milk, cheese, eggs, fish, poultry, and meat, and a few plant-based foods such as soy and quinoa, which are complete protein sources, also known as high-quality protein sources.
CPU times: user 29.4 ms, sys: 6.63 ms, total: 36.1 ms
Wall time: 819 ms


### Making the answering pipeline

In [124]:
def ask(query:str, llm_model:str, embeddings:torch.tensor, data:list[dict], print_time:bool) -> str:
    start_time = timer()
    _, indices = retrieve(query=query, embeddings=embeddings)
    context_items = [pages_chunks_mod[i] for i in indices]
    prompt = prompt_formatter(query, context_items=context_items) 
    chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model=llm_model,
    )
    rag_answer = chat_completion.choices[0].message.content
    end_time = timer()
    if print_time:
        print(f'Total time taken : {end_time-start_time:.5f}')
    return rag_answer.replace(prompt, '')

In [1]:
result = ask("What are the best sources of fibre?", model_llm_name, embeddings, pages_chunks_mod, True)
print(result)

NameError: name 'ask' is not defined