## Simple Local RAG 

In [1]:
import os
import requests
from tqdm import tqdm
import fitz

### Download book in pdf format

In [2]:
pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print('Downloading...')

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))

    if response.status_code == 200:
        with open(pdf_path, 'wb') as file, tqdm(
            desc=pdf_path,
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(chunk_size=1024):
                file.write(data)
                bar.update(len(data))
        print("FILE DOWNLOADED")
    else: 
        print(f"Failed to download the file: {response.status_code}")

### Extract data from the pdfs into a list[dict]

In [3]:
def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({
            "page_number": page_number - 41,
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count": len(text.split(". ")),
            "page_token_count": len(text) / 4,
            "text": text
        })
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

1208it [00:05, 215.55it/s]


In [4]:
import pandas as pd
df = pd.DataFrame(pages_and_texts)

df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


### Further text processing (splitting into sentences)

In [5]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This another sentence. I like Elephants.")
assert len(list(doc.sents)) == 3
list(doc.sents)

[This is a sentence., This another sentence., I like Elephants.]

In [6]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:06<00:00, 194.33it/s]


In [7]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


### Splitting sentences into chunks

In [8]:
num_sentence_chunk_size = 10

def split_list(input_list: list[str], slice_size: int) -> list[list[str]]:
    return [input_list[i : i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list, 10)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [9]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 244355.88it/s]


In [10]:
import random
random.sample(pages_and_texts, k=1)

[{'page_number': 588,
  'page_char_count': 590,
  'page_word_count': 127,
  'page_sentence_count': 11,
  'page_token_count': 147.5,
  'text': 'Food  Serving Choline (mg) Percent Daily Value  Egg  1 large  147  27  Soybeans  ½ cup  107  19  Chicken breast  3 oz.  72  13  Mushrooms, shiitake ½ c.  58  11  Potatoes  1 large  57  10  Kidney beans  ½ c.  45  8  Peanuts  ¼ c.  24  4  Brown rice  1 c.  19  3  Fact Sheet for Health Professionals: Choline. National Institute of  Health, Office of Dietary Supplements. https://ods.od.nih.gov/ factsheets/Choline-HealthProfessional/. Updated January 25, 2017.  Accessed October 28, 2017.  Summary of Water-Soluble Vitamins  Table 9.31 Water-Soluble vitamins  588  |  Water-Soluble Vitamins',
  'sentences': ['Food  Serving Choline (mg) Percent Daily Value  Egg  1 large  147  27  Soybeans  ½ cup  107  19  Chicken breast  3 oz.',
   ' 72  13  Mushrooms, shiitake ½ c.  58  11  Potatoes  1 large  57  10  Kidney beans  ½ c.  45  8  Peanuts  ¼ c.  24  4  Bro

In [11]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### Splitting each chunk into its own item

In [12]:
import re
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk 
        chunk_dict["chunk_char_count"] = len([word for word in chunk_dict["sentence_chunk"].split(" ")]) 
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 15039.68it/s]


1843

In [13]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 602,
  'sentence_chunk': 'Cruciferous Vegetables and Human Cancer Risk: Epidemiologic Evidence and Mechanistic Basis. Pharmacological Research\u202f: The Official Journal of the Italian Pharmacological Society, 55(3), 224–236. https://doi.org/10.1016/j.phrs.2007.01.009 Kozłowska, A., & Szostak-Wegierek, D. (2014). Flavonoids—Food sources and health benefits. Roczniki Panstwowego Zakladu Higieny, 65(2), 79–85. Patisaul, H. B., & Jefferson, W. (2010). The pros and cons of phytoestrogens. Frontiers in Neuroendocrinology, 31(4), 400–419. https://doi.org/10.1016/j.yfrne.2010.03.003 Phenolic Acids—An overview | ScienceDirect Topics. (n.d.).',
  'chunk_char_count': 67,
  'chunk_token_count': 152.75}]

In [14]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_token_count
count,1843.0,1843.0,1843.0
mean,583.38,112.33,183.61
std,347.79,71.22,111.89
min,-41.0,3.0,3.0
25%,280.5,44.0,78.75
50%,586.0,114.0,186.5
75%,890.0,173.0,279.62
max,1166.0,297.0,457.75


### Filter chunks of text for short chunks

In [15]:
min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f"Chunk token count: {row[1]['chunk_token_count']} | Text: {row[1]['sentence_chunk']}")

Chunk token count: 16.5 | Text: http://www.aafp.org/afp/ 2002/1001/p1217.html. 862 | Toddler Years
Chunk token count: 15.25 | Text: Accessed November 30, 2017. Discovering Nutrition Facts | 737
Chunk token count: 28.25 | Text: A concentration gradient is a form of potential energy, like water 172 | Electrolytes Important for Fluid Balance
Chunk token count: 26.25 | Text: Updated November 6, 2015. Accessed April 15, 2018. 1122 | Undernutrition, Overnutrition, and Malnutrition
Chunk token count: 3.0 | Text: 184 | Sodium


In [16]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 30,
  'chunk_token_count': 52.5}]

## Embedding Chunks!!!

In [17]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")

sentences = [
                "The sentence Transformer library provides an easy way to create embeddings.",
                "Sentences can be embedded one by one or in a list.",
                "I like horses!"
            ]


embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embeddings: {embedding}")
    print("")

  from tqdm.autonotebook import tqdm, trange


Sentence: The sentence Transformer library provides an easy way to create embeddings.
Embeddings: [-3.44286375e-02  2.95328815e-02 -2.33643427e-02  5.57257496e-02
 -2.19098609e-02 -6.47062203e-03  1.02848457e-02 -6.57803863e-02
  2.29717735e-02 -2.61121057e-02  3.80420350e-02  5.61403222e-02
 -3.68746594e-02  1.52787790e-02  4.37020473e-02 -5.19723371e-02
  4.89479862e-02  3.58104147e-03 -1.29751097e-02  3.54387122e-03
  4.23262641e-02  3.52606587e-02  2.49402281e-02  2.99177002e-02
 -1.99382380e-02 -2.39752773e-02 -3.33367917e-03 -4.30450514e-02
  5.72014526e-02 -1.32517833e-02 -3.54477987e-02 -1.13935936e-02
  5.55561110e-02  3.61099187e-03  8.88527040e-07  1.14027122e-02
 -3.82229425e-02 -2.43548071e-03  1.51314372e-02 -1.32699206e-04
  5.00659943e-02 -5.50876483e-02  1.73444841e-02  5.00959158e-02
 -3.75959277e-02 -1.04463594e-02  5.08322380e-02  1.24861132e-02
  8.67377296e-02  4.64143082e-02 -2.10690107e-02 -3.90251614e-02
  1.99698494e-03 -1.42345531e-02 -1.86794791e-02  2.82669

In [18]:
embeddings[0].shape

(768,)

In [19]:
embedding = embedding_model.encode("Anything basically")
embedding

array([ 3.81860510e-02, -1.22656384e-02,  1.15295909e-02, -4.88636941e-02,
       -5.83447218e-02,  2.13269889e-02, -7.40920380e-02,  6.19836077e-02,
       -8.72389674e-02, -3.66892479e-02,  1.95006896e-02,  1.45205911e-02,
        3.36535908e-02,  6.56199977e-02,  3.98173667e-02, -5.13510257e-02,
       -1.65609214e-02, -3.47246341e-02, -4.11265045e-02, -6.03969581e-02,
       -3.42106856e-02, -5.16315661e-02,  9.17286985e-03, -3.60094346e-02,
       -1.10503740e-03, -4.10741381e-02, -2.79709194e-02,  1.65025964e-02,
        6.77869236e-03,  8.15290958e-03, -5.59344925e-02, -4.61462252e-02,
        6.43836148e-03, -4.03254069e-02,  1.66068867e-06, -1.16471248e-02,
        1.70526374e-02,  8.01596884e-03,  9.13095661e-03,  4.61028032e-02,
        1.63893984e-03, -3.66453342e-02,  2.31296793e-02,  3.40600535e-02,
        3.01856291e-03, -6.17218576e-02,  1.06378375e-02,  4.06729132e-02,
        7.14887027e-03,  4.99239638e-02, -1.51328472e-02, -1.44329300e-04,
       -3.27568874e-02, -

In [20]:
embedding_model.to("cuda")

for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 1680/1680 [00:56<00:00, 29.55it/s]


In [21]:
parallel_comp = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

text_chunk_embedding = embedding_model.encode(parallel_comp, batch_size=32, convert_to_tensor=True)

text_chunk_embedding

tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0')

### Save embeddings to file