In [5]:
import pandas as pd
import os
import requests

In [6]:
pdf_path = "/home/pranjal/Downloads/RAG_from_scratch/Human-Nutrition-2020-Edition-1598491699.pdf"

# Download the PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading now")

  url = "https://github.com/pranzalkhadka/RAG_from_scratch/raw/main/college_data.pdf"
  #url = "https://github.com/mrdbourke/simple-local-rag/blob/raw/main/human-nutrition-text.pdf"
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File doesn't exist, downloading...
The file has been downloaded and saved as college-information.pdf


In [9]:
import fitz
from tqdm.auto import tqdm

In [10]:
def text_cleaner(text):
    # Replace newline characters with spaces and strip leading/trailing whitespaces
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

In [15]:
def read_pdf(pdf_path):
    # Open the PDF using PyMuPDF
    doc = fitz.open(pdf_path)
    pages_and_text = []
    for page_number, page in tqdm(enumerate(doc)):
        # Get the text of the page
        text = page.get_text()
        # Clean the text
        text = text_cleaner(text)
        # Append the text to the list with other information
        pages_and_text.append({
            # page number to find where the infrmation came from
            "page_number": page_number-41,
            # length of the text
            "page_char_count": len(text),
            # number of words in the text
            "page_word_count": len(text.split()),
            # number of sentences in the text
            "page_sentence_count_raw": len(text.split(". ")),
            # number of tokens in the text
            "page_token_count": len(text)/4,
            "text": text
        })

    return pages_and_text

In [16]:
pdf_path = "/home/pranjal/Downloads/RAG_from_scratch/Human-Nutrition-2020-Edition-1598491699.pdf"

In [17]:
pages_and_text = read_pdf(pdf_path)
pages_and_text[:2]

1208it [00:00, 1373.93it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 0,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [18]:
import random
random.sample(pages_and_text, k=3)

[{'page_number': 491,
  'page_char_count': 764,
  'page_word_count': 104,
  'page_sentence_count_raw': 5,
  'page_token_count': 191.0,
  'text': 'Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.  These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.    An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=310    Factors Affecting Energy Intake  |  491'},
 {'page_number': 111,
  'page_char_count': 683,
  'page_word_count': 126,
  'page_sent

In [19]:
df = pd.DataFrame(pages_and_text)

In [20]:
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,0,1,0.0,
2,-39,320,42,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,30,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,116,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [21]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,171.96606,10.519868,287.001035
std,348.86387,560.382275,86.491465,6.548495,140.095569
min,-41.0,0.0,0.0,1.0,0.0
25%,260.75,762.0,109.0,5.0,190.5
50%,562.5,1231.5,183.0,10.0,307.875
75%,864.25,1603.5,239.0,15.0,400.875
max,1166.0,2308.0,393.0,39.0,577.0


In [22]:
from spacy.lang.en import English

In [23]:
nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This is another sentence. This is a third sentence.")

assert len(list(doc.sents)) == 3

print(list(doc.sents))

[This is a sentence., This is another sentence., This is a third sentence.]


In [24]:
for item in tqdm(pages_and_text):

    # Parse the text with spaCy
    item["sentences"] = list(nlp(item["text"]).sents)

    # Convert all sentences to strings because default type is a spacy data type
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the number of sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:01<00:00, 815.23it/s]


In [25]:
random.sample(pages_and_text, k=1)

[{'page_number': 1103,
  'page_char_count': 1480,
  'page_word_count': 239,
  'page_sentence_count_raw': 16,
  'page_token_count': 370.0,
  'text': 'beneficial, as well as consuming more soy products. It is also  important to maintain a healthy weight and avoid smoking or  chewing tobacco.  Hypertension  Chronic high blood pressure, also known as hypertension, is a  significant health hazard affecting one out of three adults in the  United States.3 This chronic condition is a major cause of heart  attacks and strokes, yet it has no symptoms until blood pressure  reaches very high levels, which is why it is known as “the silent  killer.” The only way to find out if you have high blood pressure is to  get an accurate reading of your resting blood pressure rate, which  is best done by a medical professional and should be monitored  regularly.  High blood pressure is such an important factor in cardiovascular  disease, that keeping it within a healthy range is vitally important.  Blood pre

In [26]:
df = pd.DataFrame(pages_and_text)

In [27]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,171.96606,10.519868,287.001035,10.319536
std,348.86387,560.382275,86.491465,6.548495,140.095569,6.300843
min,-41.0,0.0,0.0,1.0,0.0,0.0
25%,260.75,762.0,109.0,5.0,190.5,5.0
50%,562.5,1231.5,183.0,10.0,307.875,10.0
75%,864.25,1603.5,239.0,15.0,400.875,15.0
max,1166.0,2308.0,393.0,39.0,577.0,28.0


In [29]:
# break down our list of sentences into smaller chunks
# This is because mbedding models have some fixed capacity for tokens and exceeding that will cause information loss
num_sentence_chunk_size = 10

def split_list(lst, slice_size = num_sentence_chunk_size):
    # Split the list into chunks of size of 10
    return [lst[i:i + slice_size] for i in range(0, len(lst), slice_size)]

In [31]:
for item in tqdm(pages_and_text):

    # Split the sentences into chunks
    item["sentence_chunks"] = split_list(item["sentences"], slice_size=num_sentence_chunk_size)

    # Count the number of chunks
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 384308.19it/s]


In [33]:
random.sample(pages_and_text, k=1)

[{'page_number': 886,
  'page_char_count': 1346,
  'page_word_count': 212,
  'page_sentence_count_raw': 11,
  'page_token_count': 336.5,
  'text': 'who regularly participate in sports or exercise need to eat a greater  number of calories to account for increased energy expenditures.  For carbohydrates, the AMDR is 45 to 65 percent of daily calories  (which is a recommended daily allowance of 158–228 grams for  1,400–1,600 daily calories). Carbohydrates that are high in fiber  should make up the bulk of intake. The AMDR for protein is 10 to 30  percent of daily calories (35–105 grams for 1,400 daily calories for  girls and 40–120 grams for 1,600 daily calories for boys). The AMDR  for fat is 25 to 35 percent of daily calories (39–54 grams for 1,400  daily calories for girls and 44–62 grams for 1,600 daily calories for  boys), depending on caloric intake and activity level.  Micronutrients  Key vitamins needed during puberty include vitamins D, K, and  B12. Adequate calcium intake is ess

In [34]:
df = pd.DataFrame(pages_and_text)
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,171.96606,10.519868,287.001035,10.319536,1.525662
std,348.86387,560.382275,86.491465,6.548495,140.095569,6.300843,0.644397
min,-41.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,109.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,183.0,10.0,307.875,10.0,1.0
75%,864.25,1603.5,239.0,15.0,400.875,15.0,2.0
max,1166.0,2308.0,393.0,39.0,577.0,28.0,3.0


In [35]:
import re

In [42]:
# Create a new list of dictionaries each containing a single chunk of sentences with relative information

pages_and_chunks = []
# For each page
for item in tqdm(pages_and_text):
    # For each sentence chunk
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict= {}
        # Add the page number
        chunk_dict["page_number"] = item["page_number"]
        # concatenate the elements of the sentence_chunk list into a single string separated by spaces and then remove any leading/trailing whitespace from the resulting string
        joined_sentence_chunk = " ".join(sentence_chunk).replace(" ", " ").strip()
        # If there is .(capital letter) then add space after the full stop and then the capital letter
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk)/4

        pages_and_chunks.append(chunk_dict)

100%|██████████| 1208/1208 [00:00<00:00, 54736.29it/s]


In [43]:
len(pages_and_chunks)

1843

In [46]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 913,
  'sentence_chunk': 'Middle Age  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Middle age is defined as the period from age thirty-one to fifty. The  early period of this stage is very different from the end. For example,  during the early years of middle age, many women experience  pregnancy, childbirth, and lactation. In the latter part of this life  stage, women face perimenopause, which is a transition period that  leads up to menopause, or the end of menstruation. A number of  physical changes take place in the middle-aged years, including the  loss of bone mass in women due to dropping levels of estrogen  during menopause. In both men and women, visual acuity declines,  and by age forty there can be a decreased ability to see objects  at a close distance, a condition known as presbyopia.1 All of these  are signs of aging, as the human body begins to change in subtle  and not-so-subtle ways. However, a m

In [47]:
df = pd.DataFrame(pages_and_chunks)
df.describe()

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.381443,752.14433,130.788388,188.036082
std,347.78867,456.235348,80.556195,114.058837
min,-41.0,14.0,4.0,3.5
25%,280.5,322.5,54.0,80.625
50%,586.0,765.0,134.0,191.25
75%,890.0,1139.5,198.0,284.875
max,1166.0,1871.0,416.0,467.75


In [48]:
# For text with small number of tokens, we will remove them because they wont contain much useful information but take extra compute
min_token_length = 30

for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 11.0 | Text: 442  |  Health Consequences of Alcohol Abuse
Chunk token count: 25.75 | Text: http://www.ajcn.org/content/87/1/64.long. Accessed  September 22, 2017.  554  |  Water-Soluble Vitamins
Chunk token count: 4.0 | Text: 190  |  Chloride
Chunk token count: 13.75 | Text: Accessed October 5, 2017.  540  |  Fat-Soluble Vitamins
Chunk token count: 10.25 | Text: Older Adulthood: The Golden Years  |  925


In [49]:
# Filter rows under min token length
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE',
  'chunk_char_count': 320,
  'chunk_word_count': 54,
  'chunk_token_count': 80.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program is licensed under a Creative Commons Attribution 4.0  International License, except where otherwise noted.',
  'chunk_char_count': 212,
  'chunk_word_count': 32,
  'chunk_token_count': 53.0}]

In [57]:
random.sample(pages_and_chunks_over_min_token_len, k =1)

[{'page_number': 368,
  'sentence_chunk': 'reactions that can be summarized into three basic steps:  transcription, translation, and protein folding. The first step in  constructing a protein is the transcription (copying) of the genetic  information in double-stranded deoxyribonucleic acid (DNA) into  the single-stranded, messenger macromolecule ribonucleic acid  (RNA). RNA is chemically similar to DNA, but has two differences;  one is that its backbone uses the sugar ribose and not deoxyribose;  and two, it contains the nucleotide base uracil, and not thymidine.  The RNA that is transcribed from a given piece of DNA contains the  same information as that DNA, but it is now in a form that can be  read by the cellular protein manufacturer known as the ribosome.  Next, the RNA instructs the cells to gather all the necessary amino  acids and add them to the growing protein chain in a very specific  order. This process is referred to as translation. The decoding of  genetic information to