## Simple Local RAG 

In [2]:
import os
import requests
from tqdm import tqdm
import fitz

### Download book in pdf format

In [3]:
pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print('Downloading...')

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))

    if response.status_code == 200:
        with open(pdf_path, 'wb') as file, tqdm(
            desc=pdf_path,
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(chunk_size=1024):
                file.write(data)
                bar.update(len(data))
        print("FILE DOWNLOADED")
    else: 
        print(f"Failed to download the file: {response.status_code}")

### Extract data from the pdfs into a list[dict]

In [4]:
def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({
            "page_number": page_number - 41,
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count": len(text.split(". ")),
            "page_token_count": len(text) / 4,
            "text": text
        })
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

1208it [00:05, 221.49it/s]


In [5]:
import pandas as pd
df = pd.DataFrame(pages_and_texts)

df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


### Further text processing (splitting into sentences)

In [6]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This another sentence. I like Elephants.")
assert len(list(doc.sents)) == 3
list(doc.sents)

[This is a sentence., This another sentence., I like Elephants.]

In [7]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:05<00:00, 204.00it/s]


In [8]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


### Splitting sentences into chunks

In [10]:
num_sentence_chunk_size = 10

def split_list(input_list: list[str], slice_size: int) -> list[list[str]]:
    return [input_list[i : i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list, 10)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [11]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 193615.32it/s]


In [19]:
import random
random.sample(pages_and_texts, k=1)

[{'page_number': 529,
  'page_char_count': 1273,
  'page_word_count': 262,
  'page_sentence_count': 21,
  'page_token_count': 318.25,
  'text': 'Food  Serving Vitamin A (IU) Percent Daily Value  Beef liver  3 oz.  27,185  545  Chicken liver  3 oz.  12,325  245  Milk, skim  1 c.  500  10  Milk, whole  1 c.  249  5  Cheddar cheese 1 oz.  284  6  Source: Dietary Supplement Fact Sheet: Vitamin A. National  Institutes  of  Health,  Office  of  Dietary  Supplements.  http://ods.od.nih.gov/factsheets/VitaminA-QuickFacts/. Updated  September 5, 2012. Accessed October 7, 2017.  In the United States, the most consumed carotenoids are alpha- carotene, beta-carotene, beta-cryptoxanthin, lycopene, lutein, and  zeaxanthin. See Table 9.3 “Alpha- and Beta-Carotene Content of  Various Foods” for the carotenoid content of various foods.  Table 9.3 Alpha- and Beta-Carotene Content of Various Foods  Food  Serving  Beta-carotene  (mg)  Alpha-carotene  (mg)  Pumpkin, canned  1c.  17.00  11.70  Carrot juice 

In [20]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### Splitting each chunk into its own item

In [25]:
import re
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk 
        chunk_dict["chunk_char_count"] = len([word for word in chunk_dict["sentence_chunk"].split(" ")]) 
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 16278.46it/s]


1843

In [26]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 42,
  'sentence_chunk': 'Type Description Example Notes Epidemiological Observational study of populations around the world and the impact of nutrition on health. Diets with a high consumption of saturated fat are associated with an increased risk of heart attacks. Does not determine cause-and-effect relationships. Intervention Clinical Trials Scientific investigations where a variable is changed between groups. Testing the effect of different diets on blood pressure. One group consumes an American diet, group 2 eats a diet rich in fruits and vegetables, and group 3 eats a combination of groups 1 and 2. If done correctly, it does determine cause-and-effect relationships. Randomized Clinical Trials Participants are assigned by chance to separate groups that compare different treatments. Neither the researchers nor the participants can choose which group a participant is assigned. Testing the effect of calcium supplements on women with osteoporosis.',
  'chunk_char_count

In [29]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_token_count
count,1843.0,1843.0,1843.0
mean,583.38,112.33,183.61
std,347.79,71.22,111.89
min,-41.0,3.0,3.0
25%,280.5,44.0,78.75
50%,586.0,114.0,186.5
75%,890.0,173.0,279.62
max,1166.0,297.0,457.75


### Filter chunks of text for short chunks

In [36]:
min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f"Chunk token count: {row[1]['chunk_token_count']} | Text: {row[1]['sentence_chunk']}")

Chunk token count: 25.25 | Text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=521   996 | The Major Types of Foodborne Illness
Chunk token count: 11.75 | Text: Accessed March 17, 2018. Sports Nutrition | 961
Chunk token count: 9.25 | Text: 490 | Factors Affecting Energy Intake
Chunk token count: 28.25 | Text: A concentration gradient is a form of potential energy, like water 172 | Electrolytes Important for Fluid Balance
Chunk token count: 4.25 | Text: Introduction | 61


In [37]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 30,
  'chunk_token_count': 52.5}]

## Embedding Chunks!!!