In [1]:
import os
import requests

Downloading The Document (PDF File)

In [2]:
# Local filename the textbook PDF is stored under.
pdf_name = "human-nutrition-text.pdf"

# Only hit the network when the file is not already on disk.
if not os.path.exists(pdf_name):
    print(f"Downloading {pdf_name}")

    # URL of the document to download.
    pdf_url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # requests applies no timeout by default; without one a stalled server
    # would leave this cell hanging indefinitely.
    resp = requests.get(pdf_url, timeout=60)

    if resp.status_code == 200:
        with open(pdf_name, "wb") as f:
            f.write(resp.content)
        print(f"Downloaded {pdf_name} and Saved")
    else:
        print(f"Failed to download {pdf_name}")
else:
    print(f"{pdf_name} already exists")


human-nutrition-text.pdf already exists


Open the PDF

In [3]:
!pip install PyMuPDF



In [4]:
!pip install spacy



In [5]:
import fitz  # PyMuPDF !pip install pymupdf
from tqdm.auto import tqdm

def text_formatter(text:str)->str:
    """Replace every newline with a space and trim leading/trailing whitespace."""
    return text.replace("\n", " ").strip()


def open_and_read_pdf(pdf_name:str)->list[dict]:
    """Extract the text of every page of a PDF together with simple size statistics.

    Args:
        pdf_name: Path of the PDF file to open.

    Returns:
        One dict per page, in document order, with the keys
        ``page_number`` (1-based), ``page_char_count``, ``page_word_count``,
        ``page_sentence_count_raw`` (number of pieces after splitting on ". "),
        ``page_token_count`` (rough estimate: characters / 4) and ``text``
        (the page text after ``text_formatter``).
    """
    pages = []

    # The context manager closes the document even if extraction fails part-way.
    with fitz.open(pdf_name) as doc:
        # enumerate() hides the length from tqdm, so pass the page count
        # explicitly to get a real progress bar instead of a bare counter.
        for page_no, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())
            pages.append({
                "page_number" : page_no+1,
                "page_char_count" : len(text),
                "page_word_count" : len(text.split()),
                "page_sentence_count_raw" : len(text.split(". ")),
                "page_token_count" : len(text)/4,
                "text" : text
                })
    return pages

# Read every page of the PDF into a list of per-page record dicts.
pages_and_texts = open_and_read_pdf(pdf_name)

  from .autonotebook import tqdm as notebook_tqdm
1208it [00:04, 265.74it/s]


In [6]:
import random

# Show three randomly chosen page records as a sanity check.
random.sample(pages_and_texts,k=3)

[{'page_number': 265,
  'page_char_count': 2161,
  'page_word_count': 330,
  'page_sentence_count_raw': 18,
  'page_token_count': 540.25,
  'text': 'drinking sports drinks instead of plain water enhances endurance  or performance in individuals exercising less than one hour and at  low to moderate intensities. A well-concocted sports drink contains  sugar, water, and sodium in the correct proportions so that  hydration is optimized. The sugar is helpful in maintaining blood- glucose levels needed to fuel muscles, the water keeps an athlete  hydrated, and the sodium enhances fluid absorption and replaces  some of that lost in sweat. The American College of Sports Medicine  states that the goal of drinking fluids during exercise is to prevent  dehydration, which compromises performance and endurance.  The primary source of water loss during intense physical activity  is sweat. Perspiration rates are variable and dependent on many  factors including body composition, humidity, temperature

In [7]:
import pandas as pd

# Load the page records into a DataFrame and preview the first rows.
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,29,4,1,7.25,Human Nutrition: 2020 Edition
1,2,0,0,1,0.0,
2,3,320,42,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,4,212,30,1,53.0,Human Nutrition: 2020 Edition by University of...
4,5,797,116,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [8]:
# Summary statistics of the numeric per-page columns.
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.004139,171.96606,9.972682,287.001035
std,348.86387,560.382275,86.491465,6.187226,140.095569
min,1.0,0.0,0.0,1.0,0.0
25%,302.75,762.0,109.0,4.0,190.5
50%,604.5,1231.5,183.0,10.0,307.875
75%,906.25,1603.5,239.0,14.0,400.875
max,1208.0,2308.0,393.0,32.0,577.0


Why do we need the token count?

1. Embedding models cannot handle an unlimited number of tokens.
2. LLMs cannot handle an unlimited number of tokens either.

# Splitting Pages into sentences

Two ways to do this:
1. split by `". "` OR `".\n"`
2. Use NLP like spaCy or NLTK

In [9]:
from spacy.lang.en import English 

# Create a blank English spaCy pipeline.
nlp = English()

# Add the "sentencizer" component so documents expose sentence boundaries.
nlp.add_pipe("sentencizer")

# Run the pipeline on a small example document.
doc = nlp("THis is sentence1. I like cats")

# List the sentences found in the example.
list(doc.sents)

In [14]:
# Inspect one page record by list index (index 600 holds page_number 601, since page numbers are 1-based).
pages_and_texts[600]

{'page_number': 601,
 'page_char_count': 863,
 'page_word_count': 117,
 'page_sentence_count_raw': 8,
 'page_token_count': 215.75,
 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5.\xa0Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.  \xa0https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitami

In [16]:
from tqdm import tqdm

# Split each page's text into sentences with the spaCy pipeline and record how many were found.
for page in tqdm(pages_and_texts):
    page_doc = nlp(page['text'])
    page["sentences"] = list(map(str, page_doc.sents))
    page["page_sentence_count"] = len(page["sentences"])

100%|██████████| 1208/1208 [00:06<00:00, 183.88it/s]


In [20]:
# Inspect one page record by list index (index 103 holds page_number 104), now with its sentence list.
pages_and_texts[103]

{'page_number': 104,
 'page_char_count': 1693,
 'page_word_count': 264,
 'page_sentence_count_raw': 13,
 'page_token_count': 423.25,
 'text': 'Basic Biology, Anatomy, and  Physiology  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  The Basic Structural and Functional Unit of Life:  The Cell  What distinguishes a living\xa0organism from an inanimate object? A  living organism conducts self-sustaining biological processes. A cell  is the smallest and most basic form of life.  The cell theory incorporates three principles:  Cells are the most basic building units of life.\xa0All living things  are composed of cells. New cells are made from preexisting cells,  which divide in two. Who you are has been determined because  of two cells that came together inside your mother’s womb. The  two cells containing all of your genetic information (DNA) united to  begin making new life. Cells divided and differentiated into other  cells with specif

In [21]:
# Rebuild the DataFrame so the new page_sentence_count column shows up in the statistics.
df = pd.DataFrame(pages_and_texts)
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.004139,171.96606,9.972682,287.001035,10.319536
std,348.86387,560.382275,86.491465,6.187226,140.095569,6.300843
min,1.0,0.0,0.0,1.0,0.0,0.0
25%,302.75,762.0,109.0,4.0,190.5,5.0
50%,604.5,1231.5,183.0,10.0,307.875,10.0
75%,906.25,1603.5,239.0,14.0,400.875,15.0
max,1208.0,2308.0,393.0,32.0,577.0,28.0


## Chunking the sentences together

Here we group the sentences into chunks of up to 10 sentences at a time.

Frameworks like LangChain can do this task easily.

## Need for Chunking
1. Makes the text easier to filter
2. Fits the context window of the embedding model
3. Fits the context window of the LLM

In [23]:
# Maximum number of sentences grouped into a single chunk.
sentences_chunk_size = 10


def split_sentence_chunk(sentence_list:list[str],size_chunk:int = sentences_chunk_size) -> list[list[str]]:
    """Split a list into consecutive sub-lists of at most ``size_chunk`` items.

    Args:
        sentence_list: The items (sentences) to group, in order.
        size_chunk: Maximum number of items per chunk; must be at least 1.

    Returns:
        The chunks in original order. Every chunk has ``size_chunk`` items
        except possibly the last, which holds the remainder. An empty input
        yields an empty list.

    Raises:
        ValueError: If ``size_chunk`` is less than 1.
    """
    # A negative step would make range() empty and silently drop every
    # sentence, so reject non-positive sizes explicitly.
    if size_chunk < 1:
        raise ValueError(f"size_chunk must be a positive integer, got {size_chunk}")
    return [sentence_list[i:i + size_chunk] for i in range(0,len(sentence_list),size_chunk)]

# Quick demonstration on the numbers 1..25: expect two full chunks and one remainder chunk.
test  = list(range(1,26))
print(test)
split_sentence_chunk(test)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]


[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 [21, 22, 23, 24, 25]]

In [24]:
# Group every page's sentences into chunks and record the number of chunks per page.
for item in tqdm(pages_and_texts):
    page_chunks = split_sentence_chunk(item["sentences"],sentences_chunk_size)
    item["sentence_chunks"] = page_chunks
    item["chunk_count"] = len(page_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 4013.06it/s]


In [26]:
# Spot-check one random page record, now including its sentence chunks.
random.sample(pages_and_texts,k=1)

[{'page_number': 580,
  'page_char_count': 1444,
  'page_word_count': 207,
  'page_sentence_count_raw': 12,
  'page_token_count': 361.0,
  'text': 'Cardiovascular Disease  Vitamin E reduces the oxidation of LDLs, and it was therefore  hypothesized that vitamin E supplements would protect against  atherosclerosis. However, large clinical trials have not consistently  found evidence to support this hypothesis. In fact, in the “Women’s  Angiographic Vitamin and Estrogen Study,” postmenopausal women  who took 400 international units (264 milligrams) of vitamin E and  500 milligrams of vitamin C twice per day had higher death rates  from all causes.7  Other studies have not confirmed the association between  increased vitamin E intake from supplements and increased  mortality. There is more consistent evidence from observational  studies that a higher intake of vitamin E from foods is linked to a  decreased risk of dying from a heart attack.  Cancer  The large clinical trials that evaluated

In [27]:
# Rebuild the DataFrame so the new chunk_count column shows up in the statistics.
df=pd.DataFrame(pages_and_texts)
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count,chunk_count
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.004139,171.96606,9.972682,287.001035,10.319536,1.525662
std,348.86387,560.382275,86.491465,6.187226,140.095569,6.300843,0.644397
min,1.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,302.75,762.0,109.0,4.0,190.5,5.0,1.0
50%,604.5,1231.5,183.0,10.0,307.875,10.0,1.0
75%,906.25,1603.5,239.0,14.0,400.875,15.0,2.0
max,1208.0,2308.0,393.0,32.0,577.0,28.0,3.0


## Splitting chunks to separate items
so that we can add metadata to each chunk and can embed them together.


In [29]:
# Flatten the per-page chunks into one record per chunk, each carrying its own statistics.
pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for chunk in item["sentence_chunks"]:
        # Join the chunk's sentences into one string, collapsing double spaces once.
        joined_text = " ".join(chunk).replace("  "," ").strip()
        char_count = len(joined_text)

        pages_and_chunks.append({
            "page_number": item["page_number"],  # page the chunk came from
            "text": joined_text,
            "sentence_chunk": chunk,
            "chunk_char_count": char_count,
            "chunk_word_count": len(joined_text.split()),
            "chunk_token_count": char_count / 4,  # rough estimate: 4 characters per token
        })

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

100%|██████████| 1208/1208 [00:00<00:00, 4467.75it/s]


1843

In [33]:
# Spot-check one random chunk record.
random.sample(pages_and_chunks,k=1)

[{'page_number': 210,
  'text': 'Image by Allison Calabrese / CC BY 4.0 The Kidneys Detect Blood Volume The kidneys are two bean-shaped organs, each about the size of a fist and located on either side of the spine just below the rib cage. The kidneys filter about 190 liters of blood and produce (on average) 1.5 liters of urine per day. Urine is mostly water, but it also contains electrolytes and waste products, such as urea. The amount of water filtered from the blood and excreted as urine is dependent on the amount of water in, and the electrolyte composition in the blood. Kidneys have protein sensors that detect blood volume from the pressure, or stretch, in the blood vessels of the kidneys. When blood volume is low, kidney cells detect decreased pressure and secrete the enzyme, renin. Renin travels in the blood and cleaves another protein into the active hormone, angiotensin. Angiotensin targets three different organs (the adrenal glands, the hypothalamus, and 168 | Regulation of Wa

In [34]:
# Summary statistics of the per-chunk numeric columns, rounded to two decimals.
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,625.38,735.11,112.85,183.78
std,347.79,447.64,71.28,111.91
min,1.0,12.0,3.0,3.0
25%,322.5,315.0,45.0,78.75
50%,628.0,747.0,115.0,186.75
75%,932.0,1119.0,173.0,279.75
max,1208.0,1832.0,298.0,458.0


# Remove chunks with 30 tokens or fewer

In [44]:
# Chunks must have strictly more than this many (estimated) tokens to be kept.
min_chunk_token_limit = 30

# Boolean mask of the chunks to keep, then convert the surviving rows back to a list of dicts.
long_enough = df['chunk_token_count'] > min_chunk_token_limit
pages_and_chunks_filtered = df[long_enough].to_dict(orient="records")

In [60]:
# Spot-check one random chunk that survived the token filter.
random.sample(pages_and_chunks_filtered,k=1)

[{'page_number': 643,
  'text': 'Phytochemical Phytochemical Source Phytochemical Function: Carotenoid Yellow-orange fruits, dark green leafy vegetables May possess strong cancer-fighting properties Indoles Cruciferous vegetables (i.e. bok choy, broccoli, choy sum) May inhibit the development of cancer-causing hormones and prevent tumor growth Phytoestrogen Grapes, berries, plums, soybeans, tofu, garlic May lower the risk for osteoporosis, heart disease, breast cancer, and menopausal symptoms Stanols Grains, nuts, legumes May lower blood cholesterol levels and reduce the risk of heart disease and stroke Saponins Broad beans, kidney beans, lentils May decrease blood lipids, lower cancer risks, and lower blood glucose response Terpenes Citrus fruits May slow cancer cell growth, aid in immune system support, and prevent virus related illness Flavonoids Fruits, vegetables, chocolates, wines, teas, nuts, seeds May benefit the immune system and prevent cancer cell growth. Anthocyanidins Frui

## Creating Embeddings 
Use the sentence-transformers `all-mpnet-base-v2` model to embed each chunk, then save the embeddings.

In [62]:
from sentence_transformers import  SentenceTransformer
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-mpnet-base-v2", device=device)



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [64]:
# Encode a few toy sentences to see what an embedding looks like.
sample_sentences = ["This is learing RAG",
                    "Why sky is blue.",
                    "to be or not to be?"]
sample_embeddings = model.encode(sample_sentences)

# Print each sentence next to its embedding vector.
for sent,emb in zip(sample_sentences,sample_embeddings):
    print(f"Sentence: {sent} \n Embedding: {emb}\n")
    

Sentence: This is learing RAG 
 Embedding: [ 1.35656791e-02 -4.05067690e-02  1.36057837e-02  3.92130762e-02
  3.16675417e-02 -1.27552049e-02 -2.19453555e-02  2.08696797e-02
 -2.45430116e-02 -1.30132865e-02  9.16955620e-03 -2.29533985e-02
  3.19748595e-02 -8.51906538e-02  4.52553146e-02 -7.22490204e-03
  4.48949300e-02  1.78472849e-03 -3.15134902e-03 -3.51242088e-02
  1.63252503e-02  1.83048286e-02  6.22122409e-03  1.84313096e-02
 -1.61895938e-02 -1.07237790e-02  1.45885479e-02  1.40310153e-02
 -1.98288355e-02  2.00342536e-02  1.25578595e-02 -1.09816156e-02
 -2.65763141e-03  9.31034982e-03  1.89346906e-06  3.10881250e-03
 -8.45358893e-03 -1.99653208e-02 -4.94666547e-02  5.83847091e-02
  5.50031178e-02  9.32038501e-02 -5.20485118e-02  1.68342981e-02
 -2.48690392e-03 -4.85586561e-02  1.12594273e-02  2.39719488e-02
  2.45058648e-02  5.45430481e-02  1.80712435e-02 -9.34020896e-03
  1.38298152e-02 -2.92354673e-02  1.50079578e-01  1.19415391e-03
 -5.37781380e-02  1.22232631e-01  8.64283647e-0

In [68]:
%%time

# Shape of the sample embeddings: one row per encoded sentence.
sample_embeddings.shape

CPU times: total: 0 ns
Wall time: 2 ms


(3, 768)

In [76]:
# Collect just the text of every filtered chunk, in order, for batch encoding.
text_chunks = [item["text"] for item in pages_and_chunks_filtered]

# Preview one chunk's text.
text_chunks[419]

'often. • Calm your “sweet tooth” by eating fruits, such as berries or an apple. • Replace sugary soft drinks with seltzer water, tea, or a small amount of 100 percent fruit juice added to water or soda water. The Food Industry: Functional Attributes of Carbohydrates and the Use of Sugar Substitutes In the food industry, both fast-releasing and slow-releasing carbohydrates are utilized to give foods a wide spectrum of functional attributes, including increased sweetness, viscosity, bulk, coating ability, solubility, consistency, texture, body, and browning capacity. The differences in chemical structure between the different carbohydrates confer their varied functional uses in foods. Starches, gums, and pectins are used as thickening agents in making jam, cakes, cookies, noodles, canned products, imitation cheeses, and a variety of other foods. Molecular gastronomists use slow- releasing carbohydrates, such as alginate, to give shape and texture to their fascinating food creations. Add

In [None]:

%%time
# Run the model on the CPU (this is inference via encode(), not training).
model.to("cpu")

# Alternative: run on a CUDA GPU.
# model.to("cuda")

# Attempted for an AMD GPU, but "rocm" did not work as a device name.
# NOTE(review): ROCm builds of PyTorch reportedly expose the GPU under the "cuda" device string — confirm.
# model.to("rocm") #not working

# Embed one chunk at a time and store the vector on its record.
for item in tqdm(pages_and_chunks_filtered):
    item["embedding"] = model.encode(item["text"])

# Observed throughput: nearly 1 item/s

In [77]:
%%time

# Embed all chunk texts in one call, processed in batches of 32,
# asking for the result as a tensor (convert_to_tensor=True).
text_chunk_embeddings = model.encode(text_chunks,
                                     batch_size=32,
                                     convert_to_tensor=True)



KeyboardInterrupt: 

In [None]:
2.15 saving to file start