In [1]:
import os
import requests

Downloading The Document (PDF File)

In [2]:
# Local file name under which the source document is cached.
pdf_name = "human-nutrition-text.pdf"

if os.path.exists(pdf_name):
    # Already cached by an earlier run; skip the network round-trip.
    print(f"{pdf_name} already exists")
else:
    print(f"Downloading {pdf_name}")

    # URL of the document to download.
    pdf_url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        resp = requests.get(pdf_url, timeout=60)
    except requests.RequestException as err:
        # Connection problems are reported the same way as a bad HTTP status.
        print(f"Failed to download {pdf_name}: {err}")
    else:
        if resp.status_code == 200:
            with open(pdf_name, "wb") as f:
                f.write(resp.content)
            print(f"Downloaded {pdf_name} and Saved")
        else:
            print(f"Failed to download {pdf_name}")


Downloading human-nutrition-text.pdf
Downloaded human-nutrition-text.pdf and Saved


Open the PDF

In [3]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.10 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.10 PyMuPDFb-1.24.10


In [4]:
!pip install spacy



In [5]:
import fitz  # PyMuPDF !pip install pymupdf
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Flatten text onto one line: each newline becomes a space, then both ends are trimmed."""
    single_line = " ".join(text.split("\n"))
    return single_line.strip()


def open_and_read_pdf(pdf_name:str)->list[dict]:
    """Extract the text of every page of a PDF together with simple size statistics.

    Args:
        pdf_name: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        One dict per page, in document order, with the keys:
        ``page_number`` (1-based), ``page_char_count``, ``page_word_count``
        (whitespace-separated words), ``page_sentence_count_raw`` (number of
        pieces after splitting on ". "), ``page_token_count`` (character
        count divided by 4, used as a rough token estimate) and ``text``
        (the page text after ``text_formatter``).
    """
    page_records = []

    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_name) as doc:
        # enumerate() hides the length from tqdm, so the page count is passed
        # explicitly to get a real progress bar.
        for page_number, page in tqdm(enumerate(doc, start=1), total=len(doc)):
            text = text_formatter(page.get_text())
            page_records.append({
                "page_number" : page_number,
                "page_char_count" : len(text),
                "page_word_count" : len(text.split()),
                "page_sentence_count_raw" : len(text.split(". ")),
                "page_token_count" : len(text)/4,
                "text" : text
                })
    return page_records

pages_and_texts = open_and_read_pdf(pdf_name)



In [6]:
import random

random.sample(pages_and_texts,k=3)

[{'page_number': 792,
  'page_char_count': 839,
  'page_word_count': 138,
  'page_sentence_count_raw': 14,
  'page_token_count': 209.75,
  'text': 'Meal  Calories Total Meal/Snack  Calories  Breakfast  1 scrambled egg  92  \xa0\xa0\xa0with sliced mushrooms and spinach  7  ½ whole-wheat muffin  67  1 tsp. margarine-like spread  15  1 orange  65  8 oz. low-sodium tomato juice  53  299  Snack  6 oz. fat-free flavored yogurt  100  \xa0\xa0\xa0with ½ c. raspberries  32  132  Lunch  1 sandwich on pumpernickel bread  160  \xa0\xa0\xa0with smoked turkey deli meat,  30  \xa0\xa0\xa04 slices tomato  14  \xa0\xa0\xa02 lettuce leaves  3  \xa0\xa0\xa01 tsp. mustard  3  1 oz. baked potato chips  110  ½ c. blueberries, with 1 tsp. sugar  57  8 oz. fat-free milk  90  467  Snack  1 banana  105  7 reduced-fat high-fiber crackers  120  225  Dinner  1 c. Greek salad (tomatoes, cucumbers,  feta)  150  \xa0\xa0\xa0with 5 Greek olives,  45  \xa0\xa0\xa0with 1.5 tsp. olive oil  60  3 oz. grilled chicken breas

In [7]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,29,4,1,7.25,Human Nutrition: 2020 Edition
1,2,0,0,1,0.0,
2,3,320,42,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,4,212,30,1,53.0,Human Nutrition: 2020 Edition by University of...
4,5,797,116,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [8]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.004139,171.96606,9.972682,287.001035
std,348.86387,560.382275,86.491465,6.187226,140.095569
min,1.0,0.0,0.0,1.0,0.0
25%,302.75,762.0,109.0,4.0,190.5
50%,604.5,1231.5,183.0,10.0,307.875
75%,906.25,1603.5,239.0,14.0,400.875
max,1208.0,2308.0,393.0,32.0,577.0


Why do we need a token count?

1. Embedding models can't handle an unlimited number of tokens.
2. LLMs can't handle an unlimited number of tokens either.

# Splitting Pages into sentences

Two ways to do this:
1. split by `". "` OR `".\n"`
2. Use NLP like spaCy or NLTK

In [9]:
from spacy.lang.en import English

# English language object created directly, without loading a trained pipeline.
nlp = English()

# Add the "sentencizer" pipeline component so that doc.sents is available.
nlp.add_pipe("sentencizer")

# Small example document to check the sentence splitting.
doc = nlp("THis is sentence1. I like cats")

# Displayed as the cell output: the detected sentence spans.
list(doc.sents)

[THis is sentence1., I like cats]

In [10]:
pages_and_texts[600]

{'page_number': 601,
 'page_char_count': 863,
 'page_word_count': 117,
 'page_sentence_count_raw': 8,
 'page_token_count': 215.75,
 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5.\xa0Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.  \xa0https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitami

In [11]:
from tqdm import tqdm

# Attach the spaCy-detected sentences (as plain strings) and their count to each page record.
for page_record in tqdm(pages_and_texts):
    parsed_page = nlp(page_record["text"])
    sentence_strings = list(map(str, parsed_page.sents))
    page_record["sentences"] = sentence_strings
    page_record["page_sentence_count"] = len(sentence_strings)

100%|██████████| 1208/1208 [00:02<00:00, 474.02it/s]


In [12]:
pages_and_texts[103]

{'page_number': 104,
 'page_char_count': 1693,
 'page_word_count': 264,
 'page_sentence_count_raw': 13,
 'page_token_count': 423.25,
 'text': 'Basic Biology, Anatomy, and  Physiology  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  The Basic Structural and Functional Unit of Life:  The Cell  What distinguishes a living\xa0organism from an inanimate object? A  living organism conducts self-sustaining biological processes. A cell  is the smallest and most basic form of life.  The cell theory incorporates three principles:  Cells are the most basic building units of life.\xa0All living things  are composed of cells. New cells are made from preexisting cells,  which divide in two. Who you are has been determined because  of two cells that came together inside your mother’s womb. The  two cells containing all of your genetic information (DNA) united to  begin making new life. Cells divided and differentiated into other  cells with specif

In [13]:
df = pd.DataFrame(pages_and_texts)
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.004139,171.96606,9.972682,287.001035,10.319536
std,348.86387,560.382275,86.491465,6.187226,140.095569,6.300843
min,1.0,0.0,0.0,1.0,0.0,0.0
25%,302.75,762.0,109.0,4.0,190.5,5.0
50%,604.5,1231.5,183.0,10.0,307.875,10.0
75%,906.25,1603.5,239.0,14.0,400.875,15.0
max,1208.0,2308.0,393.0,32.0,577.0,28.0


## Chunking the sentences together

Here we group the sentences 10 at a time.

Frameworks like LangChain can do this task easily.

## Need of Chunking
1. makes the text easier to filter
2. fits the context window of the embedding model
3. fits the context window of the LLM

In [14]:
# Number of sentences grouped into one chunk.
sentences_chunk_size = 10


def split_sentence_chunk(sentence_list: list[str], size_chunk: int = sentences_chunk_size) -> list[list[str]]:
    """Split a list into consecutive sub-lists of at most ``size_chunk`` items.

    Order is preserved; only the last sub-list may be shorter. An empty input
    yields an empty list.
    """
    chunks = []
    for start in range(0, len(sentence_list), size_chunk):
        stop = start + size_chunk
        chunks.append(sentence_list[start:stop])
    return chunks

test  = list(range(1,26))
print(test)
split_sentence_chunk(test)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]


[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 [21, 22, 23, 24, 25]]

In [15]:
# Group each page's sentences into chunks and record how many chunks the page produced.
for page_record in tqdm(pages_and_texts):
    page_chunks = split_sentence_chunk(page_record["sentences"], sentences_chunk_size)
    page_record.update(sentence_chunks=page_chunks, chunk_count=len(page_chunks))

100%|██████████| 1208/1208 [00:00<00:00, 227309.07it/s]


In [16]:
random.sample(pages_and_texts,k=1)

[{'page_number': 307,
  'page_char_count': 1796,
  'page_word_count': 264,
  'page_sentence_count_raw': 17,
  'page_token_count': 449.0,
  'text': 'a diet high in fructose could potentially stimulate fat deposition and  weight gain.  In human studies, excessive fructose intake has sometimes been  associated with weight gain, but results are inconsistent. Moderate  fructose intake is not associated with weight gain at all. Moreover,  other studies show that some fructose in the diet actually improves  glucose metabolism especially in people with Type 2 diabetes.5  In fact, people with diabetes were once advised to use fructose  as an alternative sweetener to table sugar. Overall, there is no good  evidence that moderate fructose consumption contributes to  weight gain and chronic disease. At this time conclusive evidence  is not available on whether fructose is any worse than any other  added sugar in increasing the risk for obesity, Type 2 diabetes, and  cardiovascular disease.  Do Low

In [17]:
df=pd.DataFrame(pages_and_texts)
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count,chunk_count
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.004139,171.96606,9.972682,287.001035,10.319536,1.525662
std,348.86387,560.382275,86.491465,6.187226,140.095569,6.300843,0.644397
min,1.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,302.75,762.0,109.0,4.0,190.5,5.0,1.0
50%,604.5,1231.5,183.0,10.0,307.875,10.0,1.0
75%,906.25,1603.5,239.0,14.0,400.875,15.0,2.0
max,1208.0,2308.0,393.0,32.0,577.0,28.0,3.0


## Splitting chunks to separate items
so that we can add metadata to each chunk and embed them together.


In [18]:
import re

# One record per sentence chunk, carrying the page it came from and size statistics.
pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for chunk in item["sentence_chunks"]:
        # Join the chunk's sentences and collapse every run of spaces to one.
        # A single str.replace("  ", " ") pass leaves doubled spaces behind
        # wherever three or more spaces were adjacent.
        chunk_text = re.sub(r" {2,}", " ", " ".join(chunk)).strip()
        chunk_char_count = len(chunk_text)

        pages_and_chunks.append({
            "page_number": item["page_number"],  # page the chunk was taken from
            "text": chunk_text,
            "sentence_chunk": chunk,
            "chunk_char_count": chunk_char_count,
            # str.split() with no argument never yields empty strings
            "chunk_word_count": len(chunk_text.split()),
            # rough estimate: about 4 characters per token
            "chunk_token_count": chunk_char_count / 4,
        })

len(pages_and_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 31558.51it/s]


1843

In [None]:
random.sample(pages_and_chunks,k=1)

[{'page_number': 210,
  'text': 'Image by Allison Calabrese / CC BY 4.0 The Kidneys Detect Blood Volume The kidneys are two bean-shaped organs, each about the size of a fist and located on either side of the spine just below the rib cage. The kidneys filter about 190 liters of blood and produce (on average) 1.5 liters of urine per day. Urine is mostly water, but it also contains electrolytes and waste products, such as urea. The amount of water filtered from the blood and excreted as urine is dependent on the amount of water in, and the electrolyte composition in the blood. Kidneys have protein sensors that detect blood volume from the pressure, or stretch, in the blood vessels of the kidneys. When blood volume is low, kidney cells detect decreased pressure and secrete the enzyme, renin. Renin travels in the blood and cleaves another protein into the active hormone, angiotensin. Angiotensin targets three different organs (the adrenal glands, the hypothalamus, and 168 | Regulation of Wa

In [19]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,625.38,735.11,112.85,183.78
std,347.79,447.64,71.28,111.91
min,1.0,12.0,3.0,3.0
25%,322.5,315.0,45.0,78.75
50%,628.0,747.0,115.0,186.75
75%,932.0,1119.0,173.0,279.75
max,1208.0,1832.0,298.0,458.0


# Remove chunks with 30 tokens or fewer

In [20]:
min_chunk_token_limit = 30

# Keep only the rows whose estimated token count is strictly above the limit,
# then convert them back to a list of per-chunk dicts.
has_enough_tokens = df["chunk_token_count"] > min_chunk_token_limit
pages_and_chunks_filtered = df.loc[has_enough_tokens].to_dict(orient="records")

In [21]:
random.sample(pages_and_chunks_filtered,k=1)

[{'page_number': 119,
  'text': 'From the Small Intestine to the Large Intestine The process of digestion is fairly efficient. Any food that is still incompletely broken down (usually less than ten percent of food consumed) and the food’s indigestible fiber content move from the small intestine to the large intestine (colon) through a connecting valve. A main task of the large intestine is to absorb much of the remaining water. Remember, water is present not only in solid foods and beverages, but also the stomach releases a few hundred milliliters of gastric juice, and the pancreas adds approximately 500 milliliters during the digestion of the meal. For the body to conserve water, it is important that excessive water is not lost in fecal matter. In the large intestine, no further chemical or mechanical breakdown of food takes place unless it is accomplished by the bacteria that inhabit this portion of the intestinal tract. The number of bacteria residing in the large intestine is estim

## Creating Embeddings
Use the sentence-transformers `all-mpnet-base-v2` model to embed each chunk, then save the embeddings.

In [22]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.1


In [23]:
from sentence_transformers import  SentenceTransformer
import torch

model = SentenceTransformer("all-mpnet-base-v2",
                            device="cuda" if torch.cuda.is_available() else "cpu")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
sample_sentences = ["This is learing RAG",
                    "Why sky is blue.",
                    "to be or not to be?"]
sample_embeddings = model.encode(sample_sentences)

for sent,emb in zip(sample_sentences,sample_embeddings):
    print(f"Sentence: {sent} \n Embedding: {emb}\n")


Sentence: This is learing RAG 
 Embedding: [ 1.35656400e-02 -4.05066721e-02  1.36057632e-02  3.92129757e-02
  3.16675603e-02 -1.27551369e-02 -2.19453145e-02  2.08696760e-02
 -2.45429892e-02 -1.30132651e-02  9.16951243e-03 -2.29534004e-02
  3.19749080e-02 -8.51907209e-02  4.52552848e-02 -7.22494675e-03
  4.48949113e-02  1.78468216e-03 -3.15136323e-03 -3.51242051e-02
  1.63251292e-02  1.83048453e-02  6.22125389e-03  1.84313599e-02
 -1.61896516e-02 -1.07237492e-02  1.45885972e-02  1.40310097e-02
 -1.98288634e-02  2.00343058e-02  1.25578074e-02 -1.09815923e-02
 -2.65760371e-03  9.31028277e-03  1.89347054e-06  3.10882600e-03
 -8.45359080e-03 -1.99653059e-02 -4.94666770e-02  5.83847389e-02
  5.50031513e-02  9.32039469e-02 -5.20484783e-02  1.68342739e-02
 -2.48698890e-03 -4.85586002e-02  1.12593984e-02  2.39718091e-02
  2.45057940e-02  5.45430183e-02  1.80712212e-02 -9.34032910e-03
  1.38299204e-02 -2.92354785e-02  1.50079474e-01  1.19404891e-03
 -5.37780672e-02  1.22232720e-01  8.64282437e-0

In [25]:
%%time

sample_embeddings.shape

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 9.78 µs


(3, 768)

In [26]:
text_chunks = [item["text"] for item in pages_and_chunks_filtered]

text_chunks[419]

'often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity. The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods. Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture to their fascinating food creations. Add

In [27]:

%%time
# Run the embedding model on the GPU when PyTorch can see one, otherwise on the CPU.
# A hard-coded "cuda" fails on CPU-only machines. (This step is inference, not
# training; ROCm builds of PyTorch also expose AMD GPUs under the "cuda" device name.)
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Embed one chunk at a time and store the vector on its record.
for item in tqdm(pages_and_chunks_filtered):
    item["embedding"] = model.encode(item["text"])

# Throughput depends heavily on the device; see the progress bar for the measured rate.

100%|██████████| 1680/1680 [00:31<00:00, 53.76it/s]

CPU times: user 30.4 s, sys: 217 ms, total: 30.6 s
Wall time: 31.3 s





In [28]:
%%time

#embed all text in batches

text_chunk_embeddings = model.encode(text_chunks,
                                     batch_size=32,
                                     convert_to_tensor=True)



CPU times: user 23.7 s, sys: 86 ms, total: 23.8 s
Wall time: 23.9 s


In [29]:
random.sample(pages_and_chunks_filtered,k=2)

[{'page_number': 623,
  'text': 'Age Group RDA Males and Females mcg DFE/day UL Infants (0–6 months) 65* Not possible to determine Infants (7–12 months) 80* Not possible to determine Children (1–3 years) 150 300 Children (4–8 years) 200 400 Children (9–13 years) 300 600 Adolescents (14–18 years) 400 800 Adults (> 19 years) 400 1000 *denotes Adequate Intake Dietary Supplement Fact Sheet: Folate. National Institute of Health, Office of Dietary Supplements. https://ods.od.nih.gov/factsheets/ Folate-HealthProfessional/. Updated April 20, 2016. Accessed October 22, 2017. Dietary Sources Folate is found naturally in a wide variety of food especially in dark leafy vegetables, fruits, and animal products. The U.S. Food and Drug Administration (FDA) began requiring manufacturers to fortify enriched breads, cereals, flours, and cornmeal to increase the consumption of folate in the American diet. For the folate content of various foods, see Table 9.26 “Folate Content of Various Foods”. Table 9.26

Save to file

In [30]:
# 2.15 saving to file
text_chunks_with_embedding = pd.DataFrame(pages_and_chunks_filtered)

embedding_save_path = "text_chunks_with_embedding.csv"

text_chunks_with_embedding.to_csv(embedding_save_path,index=False)


# Embeddings saved — read mode (start here to reuse the saved embeddings)

In [5]:
# imports libraries so no need to run above cell
import os
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import requests
import torch

embedding_save_path = "text_chunks_with_embedding.csv"


In [7]:
# if colab then install torch_xla
if 'COLAB_GPU' in os.environ:
    !pip install torch_xla

In [8]:
import torch
import platform
import os

def get_device():
    """Choose the torch device to run on.

    Preference order: a GPU usable through ``torch.cuda``, then a Colab TPU
    (via ``torch_xla``), then the CPU.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true; the
        XLA device when running in Colab with ``COLAB_TPU_ADDR`` set;
        otherwise ``cpu``.

    Raises:
        ImportError: if a Colab TPU is detected but ``torch_xla`` is not installed.
    """
    # NVIDIA GPUs and AMD GPUs on ROCm builds of PyTorch are both reached
    # through the "cuda" device type. "rocm" is not a device string that
    # torch.device() accepts, so there is no separate ROCm branch.
    if torch.cuda.is_available():
        return torch.device("cuda")

    # Colab TPU runtime. COLAB_GPU is used here only as the "running in Colab"
    # marker; its value is not trusted for GPU selection because returning
    # "cuda" when torch.cuda.is_available() is false would fail at first use.
    if 'COLAB_GPU' in os.environ and 'COLAB_TPU_ADDR' in os.environ:
        import torch_xla.core.xla_model as xm
        return xm.xla_device()

    # If none of the above, use CPU
    return torch.device("cpu")

# Get the device
device = get_device()
print(f"Using device: {device}")

# Optional: Print more details about the device
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
elif device.type == "rocm":
    print("AMD GPU detected")
elif device.type == "xla":
    print("TPU detected")
else:
    print(f"CPU: {platform.processor()}")

Using device: cpu
CPU: AMD64 Family 23 Model 24 Stepping 1, AuthenticAMD


In [2]:
# Remote copy of the pre-computed embeddings, used when no local file exists.
embedding_file_url = "https://github.com/prathameshks/FileChat-Using-RAG/raw/refs/heads/main/text_chunks_with_embedding.csv"

if not os.path.exists(embedding_save_path):
    # download from url
    print(f"Downloading {embedding_save_path}")
    # Without a timeout, requests waits indefinitely on a stalled server.
    embedding_file = requests.get(embedding_file_url, timeout=60)
    # Fail loudly on an HTTP error instead of writing the error page to disk,
    # where the next run would mistake it for a valid cached CSV.
    embedding_file.raise_for_status()

    with open(embedding_save_path, "wb") as f:
        f.write(embedding_file.content)

    print(f"Downloaded {embedding_save_path} and Saved")
else:
    print("Found Embedding file.")

text_chunks_with_embedding = pd.read_csv(embedding_save_path)

# The CSV holds each embedding as text ("[v1 v2 ...]"); strip the brackets and
# parse the space-separated numbers back into a NumPy array.
text_chunks_with_embedding['embedding'] = text_chunks_with_embedding['embedding'].apply(
    lambda x: np.fromstring(x.strip('[]'), sep=" ")
)

# List of per-chunk dicts, same shape as produced by the embedding-creation section.
pages_and_chunks_filtered = text_chunks_with_embedding.to_dict(orient="records")


Found Embedding file.


In [11]:
# convert embedding column to tensor from object
embeddings = torch.tensor(np.array(text_chunks_with_embedding["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([1680, 768])

In [3]:
text_chunks_with_embedding.dtypes

page_number            int64
text                  object
sentence_chunk        object
chunk_char_count       int64
chunk_word_count       int64
chunk_token_count    float64
embedding             object
dtype: object

In [4]:
random.sample(pages_and_chunks_filtered,k=1)

[{'page_number': 437,
  'text': 'Diseases Involving Proteins UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM As you may recall, moderation refers to having the proper amount of a nutrient—having neither too little nor too much. A healthy diet incorporates all nutrients in moderation. Low protein intake has several health consequences, and a severe lack of protein in the diet eventually causes death. Although severe protein deficiency is a rare occurrence in children and adults in the United States, it is estimated that more than half of the elderly in nursing homes are protein-deficient. The Acceptable Macronutrient Distribution Range (AMDR) for protein for adults is between 10 and 35 percent of kilocalories, which is a fairly wide range. The percent of protein in the diet that is associated with malnutrition and its health consequences is less than 10 percent, but this is often accompanied by deficiencies in calories and other micron

In [10]:
from sentence_transformers import SentenceTransformer,util

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",device=device)



In [12]:
# create a query and its embedding
query = "Diseases Involving Proteins"
print(f"Query: {query}")
query_embedding = embedding_model.encode(query, convert_to_tensor=True)


Query: Diseases Involving Proteins


In [13]:
%time

# Find the closest 5 sentences of the corpus for query sentence based on cosine similarity
dot_scores = util.dot_score(a=query_embedding,b=embeddings)[0]


# showing top 5 results
top_5_dot_products = torch.topk(dot_scores,k=5)
top_5_dot_products

CPU times: total: 0 ns
Wall time: 2 ms


torch.return_types.topk(
values=tensor([0.7183, 0.6313, 0.6193, 0.6077, 0.5601]),
indices=tensor([586, 587, 589, 599, 590]))

In [14]:
import textwrap

def print_wrap(text,wrap_width=80):
    """Print ``text`` wrapped to ``wrap_width`` columns."""
    wrapped_lines = textwrap.wrap(text, wrap_width)
    print("\n".join(wrapped_lines))

In [18]:
# Print each top-scoring chunk with its score and the page it came from.

print(f"Query: {query} \n Top 5 Results:")

top_scores, top_indices = top_5_dot_products
for score, index in zip(top_scores, top_indices):
    hit = pages_and_chunks_filtered[index]
    print(f"Score: {score}")
    print("Text:")
    print_wrap(hit["text"])

    print(f"\nPage No. {hit['page_number']}")
    print("\n")

Query: Diseases Involving Proteins 
 Top 5 Results:
Score: 0.7182703614234924
Text:
Diseases Involving Proteins UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND
HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM As you may recall,
moderation refers to having the proper amount of a nutrient—having neither too
little nor too much. A healthy diet incorporates all nutrients in moderation.
Low protein intake has several health consequences, and a severe lack of protein
in the diet eventually causes death. Although severe protein deficiency is a
rare occurrence in children and adults in the United States, it is estimated
that more than half of the elderly in nursing homes are protein-deficient. The
Acceptable Macronutrient Distribution Range (AMDR) for protein for adults is
between 10 and 35 percent of kilocalories, which is a fairly wide range. The
percent of protein in the diet that is associated with malnutrition and its
health consequences is less than 10 percent, but this is often accom