In [1]:
import pandas as pd
import os
import requests

In [6]:
pdf_path = "/home/pranjal/Downloads/RAG_from_scratch/Human-Nutrition-2020-Edition-1598491699.pdf"

# Download the PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading now")

  url = "https://github.com/pranzalkhadka/RAG_from_scratch/raw/main/college_data.pdf"
  #url = "https://github.com/mrdbourke/simple-local-rag/blob/raw/main/human-nutrition-text.pdf"
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File doesn't exist, downloading...
The file has been downloaded and saved as college-information.pdf


In [2]:
import fitz
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def text_cleaner(text):
    # Replace newline characters with spaces and strip leading/trailing whitespaces
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

In [4]:
def read_pdf(pdf_path):
    # Open the PDF using PyMuPDF
    doc = fitz.open(pdf_path)
    pages_and_text = []
    for page_number, page in tqdm(enumerate(doc)):
        # Get the text of the page
        text = page.get_text()
        # Clean the text
        text = text_cleaner(text)
        # Append the text to the list with other information
        pages_and_text.append({
            # page number to find where the infrmation came from
            "page_number": page_number-41,
            # length of the text
            "page_char_count": len(text),
            # number of words in the text
            "page_word_count": len(text.split()),
            # number of sentences in the text
            "page_sentence_count_raw": len(text.split(". ")),
            # number of tokens in the text
            "page_token_count": len(text)/4,
            "text": text
        })

    return pages_and_text

In [5]:
pdf_path = "/home/pranjal/Downloads/RAG_from_scratch/Human-Nutrition-2020-Edition-1598491699.pdf"

In [6]:
pages_and_text = read_pdf(pdf_path)
pages_and_text[:2]

1208it [00:00, 1314.37it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 0,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [7]:
import random
random.sample(pages_and_text, k=3)

[{'page_number': 1136,
  'page_char_count': 1841,
  'page_word_count': 278,
  'page_sentence_count_raw': 22,
  'page_token_count': 460.25,
  'text': 'In the United States, there are additional subgroups that are at  risk and are more likely than others to face hunger and malnutrition.  They include low-income families and the working poor, who are  employed but have incomes below the federal poverty level.  Senior citizens are also a major at-risk group. Many elderly people  are frail and isolated, which affects their ability to meet their dietary  requirements. In addition, many also have low incomes, limited  resources, and difficulty purchasing or preparing food due to health  issues or poor mobility. As a result, more than six million senior  citizens in the United States face the threat of hunger.6  One of the groups that struggles with hunger are the millions  of homeless people across North America. According to a recent  study by the US Conference of Mayors, the majority of rep

In [8]:
df = pd.DataFrame(pages_and_text)

In [9]:
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,0,1,0.0,
2,-39,320,42,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,30,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,116,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [10]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,171.96606,10.519868,287.001035
std,348.86387,560.382275,86.491465,6.548495,140.095569
min,-41.0,0.0,0.0,1.0,0.0
25%,260.75,762.0,109.0,5.0,190.5
50%,562.5,1231.5,183.0,10.0,307.875
75%,864.25,1603.5,239.0,15.0,400.875
max,1166.0,2308.0,393.0,39.0,577.0


In [11]:
from spacy.lang.en import English

In [12]:
nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This is another sentence. This is a third sentence.")

assert len(list(doc.sents)) == 3

print(list(doc.sents))

[This is a sentence., This is another sentence., This is a third sentence.]


In [13]:
for item in tqdm(pages_and_text):

    # Parse the text with spaCy
    item["sentences"] = list(nlp(item["text"]).sents)

    # Convert all sentences to strings because default type is a spacy data type
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the number of sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:01<00:00, 831.83it/s]


In [14]:
random.sample(pages_and_text, k=1)

[{'page_number': 906,
  'page_char_count': 495,
  'page_word_count': 68,
  'page_sentence_count_raw': 4,
  'page_token_count': 123.75,
  'text': 'downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.    An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=477    906  |  Introduction',
  'sentences': ['downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).',
   ' Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.',
   '   An interactive or media e

In [15]:
df = pd.DataFrame(pages_and_text)

In [16]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,171.96606,10.519868,287.001035,10.319536
std,348.86387,560.382275,86.491465,6.548495,140.095569,6.300843
min,-41.0,0.0,0.0,1.0,0.0,0.0
25%,260.75,762.0,109.0,5.0,190.5,5.0
50%,562.5,1231.5,183.0,10.0,307.875,10.0
75%,864.25,1603.5,239.0,15.0,400.875,15.0
max,1166.0,2308.0,393.0,39.0,577.0,28.0


In [17]:
# break down our list of sentences into smaller chunks
# This is because mbedding models have some fixed capacity for tokens and exceeding that will cause information loss
num_sentence_chunk_size = 10

def split_list(lst, slice_size = num_sentence_chunk_size):
    # Split the list into chunks of size of 10
    return [lst[i:i + slice_size] for i in range(0, len(lst), slice_size)]

In [18]:
for item in tqdm(pages_and_text):

    # Split the sentences into chunks
    item["sentence_chunks"] = split_list(item["sentences"], slice_size=num_sentence_chunk_size)

    # Count the number of chunks
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 495135.27it/s]


In [19]:
random.sample(pages_and_text, k=1)

[{'page_number': -18,
  'page_char_count': 419,
  'page_word_count': 61,
  'page_sentence_count_raw': 2,
  'page_token_count': 104.75,
  'text': 'Careers in Nutrition  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  1143  Part XIX. Appendices  Appendix A  Appendix A  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  1151  Appendix B  Attributions  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  1156',
  'sentences': ['Careers in Nutrition  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  1143  Part XIX.',
   'Appendices  Appendix A  Appendix A  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  1151  Appendix B  Attributions  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Pro

In [20]:
df = pd.DataFrame(pages_and_text)
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,171.96606,10.519868,287.001035,10.319536,1.525662
std,348.86387,560.382275,86.491465,6.548495,140.095569,6.300843,0.644397
min,-41.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,109.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,183.0,10.0,307.875,10.0,1.0
75%,864.25,1603.5,239.0,15.0,400.875,15.0,2.0
max,1166.0,2308.0,393.0,39.0,577.0,28.0,3.0


In [21]:
import re

In [22]:
# Create a new list of dictionaries each containing a single chunk of sentences with relative information

pages_and_chunks = []
# For each page
for item in tqdm(pages_and_text):
    # For each sentence chunk
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict= {}
        # Add the page number
        chunk_dict["page_number"] = item["page_number"]
        # concatenate the elements of the sentence_chunk list into a single string separated by spaces and then remove any leading/trailing whitespace from the resulting string
        joined_sentence_chunk = " ".join(sentence_chunk).replace(" ", " ").strip()
        # If there is .(capital letter) then add space after the full stop and then the capital letter
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk)/4

        pages_and_chunks.append(chunk_dict)

100%|██████████| 1208/1208 [00:00<00:00, 75352.75it/s]


In [23]:
len(pages_and_chunks)

1843

In [24]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 696,
  'sentence_chunk': 'Bellingham  fluorosis by  Editmore /  Public  Domain  decay ranges between 0.7–1.2 milligrams per liter. Exposure to  fluoride at three to five times this concentration before the growth  of permanent teeth can cause fluorosis, which is the mottling and  discoloring of the teeth.  Figure 11.7 A Severe Case of Fluorosis  Fluoride’s benefits to mineralized tissues of the teeth are well  substantiated, but the effects of fluoride on bone are not as well  known. Fluoride is currently being researched as a potential  treatment for osteoporosis. The data are inconsistent on whether  consuming fluoridated water reduces the incidence of osteoporosis  and fracture risk. Fluoride does stimulate osteoblast bone building  activity, and fluoride therapy in patients with osteoporosis has been  shown to increase BMD. In general, it appears that at low doses,  fluoride treatment increases BMD in people with osteoporosis and  is more effective in increasing bo

In [25]:
df = pd.DataFrame(pages_and_chunks)
df.describe()

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.381443,752.14433,130.788388,188.036082
std,347.78867,456.235348,80.556195,114.058837
min,-41.0,14.0,4.0,3.5
25%,280.5,322.5,54.0,80.625
50%,586.0,765.0,134.0,191.25
75%,890.0,1139.5,198.0,284.875
max,1166.0,1871.0,416.0,467.75


In [26]:
# For text with small number of tokens, we will remove them because they wont contain much useful information but take extra compute
min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 29.0 | Text: A concentration gradient is a form of potential energy, like water  172  |  Electrolytes Important for Fluid Balance
Chunk token count: 11.0 | Text: 420  |  Proteins, Diet, and Personal Choices
Chunk token count: 16.75 | Text: Accessed January 20, 2018.  1032  |  The Effect of New Technologies
Chunk token count: 29.75 | Text: 2010). EH. Net Encyclopedia.  http://eh.net/?s=History+of+Food+and+Drug+Regulatio Protecting the Public Health  |  1011
Chunk token count: 22.0 | Text: Updated September 2003. Accessed  November 28,2017.  Discovering Nutrition Facts  |  735


In [27]:
# Filter rows under min token length
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE',
  'chunk_char_count': 320,
  'chunk_word_count': 54,
  'chunk_token_count': 80.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program is licensed under a Creative Commons Attribution 4.0  International License, except where otherwise noted.',
  'chunk_char_count': 212,
  'chunk_word_count': 32,
  'chunk_token_count': 53.0}]

In [28]:
random.sample(pages_and_chunks_over_min_token_len, k =1)

[{'page_number': 12,
  'sentence_chunk': 'Protein  Necessary for tissue formation, cell reparation, and  hormone and enzyme production. It is essential for  building strong muscles and a healthy immune system.  Carbohydrates  Provide a ready source of energy for the body and  provide structural constituents for the formation of  cells.  Fat  Provides stored energy for the body, functions as  structural components of cells and also as signaling  molecules for proper cellular communication. It  provides insulation to vital organs and works to  maintain body temperature.  Vitamins  Regulate body processes and promote normal  body-system functions.  Minerals  Regulate body processes, are necessary for proper  cellular function, and comprise body tissue.  Water  Transports essential nutrients to all body parts,  transports waste products for disposal, and aids with  body temperature maintenance.    Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educati

In [31]:
from sentence_transformers import SentenceTransformer

In [32]:
embedding_model = SentenceTransformer(model_name_or_path = "all-mpnet-base-v2", device = "cuda")

In [36]:
sentences = ["I like apple", "Apple tastes good", "I am going to school"]
embeddings = embedding_model.encode(sentences)
embedding_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embedding_dict.items():
    print(f"Sentence: {sentence} | Embedding: {embedding}")

Sentence: I like apple | Embedding: [-3.57289910e-02  7.01562688e-02 -8.19500419e-05 -6.13328107e-02
  3.51409242e-02  1.03940973e-02 -2.00692825e-02 -3.34104896e-02
  2.29680389e-02 -1.59613770e-02 -2.44812500e-02  3.00852153e-02
  7.70163315e-04  6.79605082e-03 -5.50421923e-02  4.22703922e-02
  5.31536527e-02  4.09525074e-02 -3.31378472e-03  3.86013160e-03
 -2.55373283e-03  4.14209925e-02  1.90771483e-02 -2.22908072e-02
 -1.52514298e-02  2.04589274e-02 -1.79591980e-02  2.96158460e-03
  1.71427578e-02  4.01425213e-02 -2.41378415e-02  3.45577952e-04
 -7.49801397e-02 -6.30647596e-03  1.75654634e-06  1.62909664e-02
  2.68397089e-02  3.63095142e-02 -9.73099750e-03  5.33018187e-02
  1.17877722e-02  4.03850377e-02 -4.74252217e-02  2.40221526e-02
 -2.64692325e-02  3.51310149e-02  1.30193001e-02  3.03052273e-02
  1.19034396e-02  9.18840058e-03 -2.88293958e-02 -3.54352742e-02
 -3.80387753e-02  1.66958701e-02 -3.32450122e-02  4.68399413e-02
  6.12730198e-02 -1.04559381e-02  8.59402940e-02  2.88

In [37]:
embeddings[0].shape

(768,)

In [41]:
embedding_model.to("cuda")

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [42]:
for item in tqdm(pages_and_chunks_over_min_token_len):
    # Encode the sentence chunk
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 1686/1686 [01:27<00:00, 19.30it/s]


In [43]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [44]:
# Import saved csv file containing the embeddings
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF ...,320,54,80.0,[ 6.74242899e-02 9.02281702e-02 -5.09548699e-...
1,-38,Human Nutrition: 2020 Edition by University of...,212,32,53.0,[ 5.52156121e-02 5.92139401e-02 -1.66167337e-...
2,-37,Contents Preface University of Hawai‘i at Mā...,797,147,199.25,[ 2.79801823e-02 3.39813717e-02 -2.06426494e-...
3,-36,Lifestyles and Nutrition University of Hawai‘...,976,179,244.0,[ 6.82566687e-02 3.81275155e-02 -8.46854225e-...
4,-35,The Cardiovascular System University of Hawai...,1037,191,259.25,[ 3.30264382e-02 -8.49764794e-03 9.57158674e-...


In [45]:
import torch
import numpy as np 

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array 
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([1686, 768])

In [46]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF ...,320,54,80.0,"[0.0674242899, 0.0902281702, -0.00509548699, -..."
1,-38,Human Nutrition: 2020 Edition by University of...,212,32,53.0,"[0.0552156121, 0.0592139401, -0.0166167337, -0..."
2,-37,Contents Preface University of Hawai‘i at Mā...,797,147,199.25,"[0.0279801823, 0.0339813717, -0.0206426494, 0...."
3,-36,Lifestyles and Nutrition University of Hawai‘...,976,179,244.0,"[0.0682566687, 0.0381275155, -0.00846854225, -..."
4,-35,The Cardiovascular System University of Hawai...,1037,191,259.25,"[0.0330264382, -0.00849764794, 0.00957158674, ..."


In [47]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device=device) 

In [64]:
query = "macronutrients functions"

print(f"Query: {query}")

Query: macronutrients functions


In [65]:
query_embedding = embedding_model.encode(query, convert_to_tensor=True).to(device)

dot_score = util.dot_score(a=query_embedding, b=embeddings)[0]

top_results_dot_product = torch.topk(dot_score, k=5)
print(top_results_dot_product)

torch.return_types.topk(
values=tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473], device='cuda:0'),
indices=tensor([42, 47, 41, 51, 46], device='cuda:0'))


In [66]:
pages_and_chunks[42]

{'page_number': 5,
 'sentence_chunk': 'Macronutrients  Nutrients  that  are  needed  in  large  amounts  are  called  macronutrients. There are three classes of macronutrients:  carbohydrates, lipids, and proteins. These can be metabolically  processed into cellular energy. The energy from macronutrients  comes from their chemical bonds. This chemical energy is  converted into cellular energy that is then utilized to perform work,  allowing our bodies to conduct their basic functions. A unit of  measurement of food energy is the calorie. On nutrition food labels  the amount given for “calories” is actually equivalent to each calorie  multiplied by one thousand. A kilocalorie (one thousand calories,  denoted with a small “c”) is synonymous with the “Calorie” (with a  capital “C”) on nutrition food labels. Water is also a macronutrient in  the sense that you require a large amount of it, but unlike the other  macronutrients, it does not yield calories.  Carbohydrates  Carbohydrates are m

In [67]:
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [68]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'macronutrients functions'

Results:
Score: 0.6926
Text:
Macronutrients  Nutrients  that  are  needed  in  large  amounts  are  called
macronutrients. There are three classes of macronutrients:  carbohydrates,
lipids, and proteins. These can be metabolically  processed into cellular
energy. The energy from macronutrients  comes from their chemical bonds. This
chemical energy is  converted into cellular energy that is then utilized to
perform work,  allowing our bodies to conduct their basic functions. A unit of
measurement of food energy is the calorie. On nutrition food labels  the amount
given for “calories” is actually equivalent to each calorie  multiplied by one
thousand. A kilocalorie (one thousand calories,  denoted with a small “c”) is
synonymous with the “Calorie” (with a  capital “C”) on nutrition food labels.
Water is also a macronutrient in  the sense that you require a large amount of
it, but unlike the other  macronutrients, it does not yield calories.
Carbohydrate

In [69]:
import torch

def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    # Get Euclidean/L2 norm of each vector (removes the magnitude, keeps direction)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product / (norm_vector1 * norm_vector2)

# Example tensors
vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# Calculate dot product
print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))

# Calculate cosine similarity
print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))

Dot product between vector1 and vector2: tensor(14.)
Dot product between vector1 and vector3: tensor(32.)
Dot product between vector1 and vector4: tensor(-14.)
Cosine similarity between vector1 and vector2: tensor(1.0000)
Cosine similarity between vector1 and vector3: tensor(0.9746)
Cosine similarity between vector1 and vector4: tensor(-1.0000)


Create a pipeline using functions

In [84]:
def retrieve_relevant_resources(query,
                                embeddings,
                                model= embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    # Get dot product scores on embeddings
    dot_scores = util.dot_score(query_embedding, embeddings)[0]

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings")

    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices


def print_top_results_and_scores(query,
                                 embeddings,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [85]:
query = "symptoms of pellagra"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 1686 embeddings


(tensor([0.5000, 0.3741, 0.2959, 0.2793, 0.2721], device='cuda:0'),
 tensor([ 826,  857, 1541, 1560, 1536], device='cuda:0'))

In [86]:
# Print out the texts of the top scores
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

[INFO] Time taken to get scores on 1686 embeddings
Query: symptoms of pellagra

Results:
Score: 0.5000
Niacin deficiency is commonly known as pellagra and  the symptoms include
fatigue, decreased appetite, and indigestion.   These symptoms are then commonly
followed by the four D’s:  diarrhea, dermatitis, dementia, and sometimes death.
Figure 9.12  Conversion of Tryptophan to Niacin  Water-Soluble Vitamins  |  565
Page number: 565


Score: 0.3741
car. Does it drive faster with a half-tank of gas or a full one? It does  not
matter; the car drives just as fast as long as it has gas. Similarly,  depletion
of B vitamins will cause problems in energy metabolism,  but having more than is
required to run metabolism does not speed  it up. Buyers of B-vitamin
supplements beware; B vitamins are not  stored in the body and all excess will
be flushed down the toilet  along with the extra money spent.  B vitamins are
naturally present in numerous foods, and many  other foods are enriched with
them.