# Importing Libraries

In [None]:
import pandas as pd
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
import os
import requests
import random
from spacy.lang.en import English
import re
import fitz
from tqdm.auto import tqdm
import random
import torch
import numpy as np

# Downloading the pdf file
The below code checks whether a PDF file named "human-nutrition-text.pdf" exists in the current directory. If the file does not exist, it downloads the PDF from a specified URL (https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf) using the requests library. It sends a GET request to the URL and, upon receiving a successful response (HTTP status code 200), saves the file locally with the specified filename in binary write mode. If the file already exists, it skips the download process and notifies the user.

In [3]:
# Get PDF document
pdf_path = "human-nutrition-text.pdf"
# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
  #url="https://books.underscore.io/essential-scala/essential-scala.pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


# Text_Formatter

The text_formatter function takes a string input (text) and performs minor formatting. Specifically, it removes all newline characters (\n) from the text by replacing them with spaces and then removes any leading or trailing whitespace using the strip() method. Finally, it returns the cleaned and formatted text. This is useful for preparing raw text data for further processing or display.


In [4]:
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

# open_and_read_pdf
The open_and_read_pdf function reads a PDF file from the specified pdf_path and extracts structured information from each page. It uses the fitz library (from PyMuPDF) to open the PDF and process its pages. Here's what the function does step by step:

Open the PDF: The fitz.open(pdf_path) method opens the PDF document.

Initialize an empty list: pages_and_texts is used to store information about each page.

Iterate through each page: Using enumerate(doc), it loops through all pages in the PDF.

Extract text from the page: The page.get_text() method retrieves plain UTF-8 encoded text from the page.

Format the text: The text is cleaned using the text_formatter function to remove newlines and extra spaces.

Calculate and store page details:

page_number: Adjusted by subtracting 41, assuming the actual content starts from page 42.

page_char_count: Total characters in the page's text.

page_word_count: Total words in the text (split by spaces).

page_sentence_count_raw: Estimated number of sentences by splitting on ". " (simple heuristic).

page_token_count: Approximate number of tokens, calculated by dividing the character count by 4 (a token is estimated to be ~4 characters).

text: The cleaned page text.

Append to the list: A dictionary containing all the above details is appended to pages_and_texts.

Return the result: The function returns a list of dictionaries, with each dictionary representing a page and its corresponding information.

In [5]:
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

In [6]:
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:47]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -39,
  'page_char_count': 320,
  'page_word_count': 54,
  'page_sentence_count_raw': 1,
  'page_token_count': 80.0,
  'text': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE'},
 {'page_number': -38,
  'page_char_count': 212,
  'page_word_count': 32,
  'page_sentence_count_raw': 1,
  'page_token_count': 53.0,
  'text': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food 

In [7]:
random.sample(pages_and_texts, k=5)

[{'page_number': 1065,
  'page_char_count': 1427,
  'page_word_count': 249,
  'page_sentence_count_raw': 22,
  'page_token_count': 356.75,
  'text': 'reviews of randomized clinical trials reported that on average,  obesity treatments cause weight gain.56\xa0 This additional weight gain  leads to an increase in the set point, making it more difficult for an  individual to lose weight in the future. \xa0 Others reported a 3-5 %  weight loss was possible 4 years later if participants continued all  aspects of treatment.7\xa0\xa0For a 200 pound person, this represents a  6-10 pound weight loss.\xa0 The health benefits of this modest weight  loss are unclear and it is far less what is expected or desired when  following a diet.\xa0 In conclusion, the diet industry makes money from  a product that is proven not to work.  5.\xa0Mann, T., Tomiyama, A. J., Westling, E., Lew, A.-M.,  Samuels, B., & Chatman, J. (2007). Medicare’s search for  effective obesity treatments: Diets are not the answer.

In [8]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


# sentencizer
This code demonstrates how to use the spacy library to identify sentences in a given text. It starts by importing the English language model and adding a "sentencizer" pipeline to the model, which is specifically designed to segment text into sentences. An example text, "This is a sentence. This another sentence.", is processed by the model to create a doc object. The doc.sents attribute is used to extract the sentences, which are converted to a list. The assert statement ensures that the text is correctly segmented into two sentences. Finally, list(doc.sents) provides the segmented sentences as a list.

In [9]:
 # see https://spacy.io/usage for install instructions
nlp = English()
# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/
nlp.add_pipe("sentencizer")
# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2
# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

# SpaCy pipeline
This code processes each page's text from the pages_and_texts list to extract and count sentences using SpaCy's nlp pipeline. For each item in the list, it applies the SpaCy pipeline (nlp) to the "text" field, which segments the text into sentences and stores them in the "sentences" field as a list. It then ensures all sentences are converted to strings using a list comprehension. Finally, the total number of sentences for that page is calculated as the length of the "sentences" list and stored in the "page_sentence_count_spacy" field. This ensures accurate sentence segmentation and counts for each page.

In [10]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

# split_list

This code divides the sentences from each page of text in pages_and_texts into smaller chunks of a specified size (num_sentence_chunk_size, set to 10 in this case). It defines a split_list function that splits a given list into sublists of the desired size (slice_size) using list slicing. Each page's "sentences" field is processed using this function, creating sentence chunks stored in the "sentence_chunks" field for that page. Additionally, the total number of chunks is calculated as the length of the "sentence_chunks" list and stored in the "num_chunks" field. This approach efficiently organizes sentences into manageable groups for further analysis or processing.

In [11]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': 979,
  'page_char_count': 1715,
  'page_word_count': 298,
  'page_sentence_count_raw': 11,
  'page_token_count': 428.75,
  'text': '• Amino Acid Supplements. Certain amino acid supplements,  which are often taken by bodybuilders among others, can  increase the risk of consuming too much protein. An  occasional amino acid drink in the place of a meal is not a  problem. However, problems may arise if you add the  supplement to your existing diet. Most Americans receive two  to three times the amount of protein required on a daily basis  from their existing diets—taking amino acid supplements just  adds to the excess. Also, certain amino acids share the same  transport systems in the absorption process; therefore, a  concentrated excess of one amino acid obtained from a  supplement may increase the probability of decreased  absorption of another amino acid that uses the same transport  system. This could lead to deficiency in the competing amino  acid.  Supplement Claims 

In [19]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


# pages_and_chunks

This code creates a new list, pages_and_chunks, where each chunk of sentences (from the original pages_and_texts list) becomes its own dictionary entry with detailed metadata. For each item in pages_and_texts, it loops through the "sentence_chunks" field, joining each chunk's sentences into a single string (a paragraph-like structure). The string is cleaned, ensuring proper spacing and formatting, including a regular expression to fix punctuation spacing (e.g., ".A" becomes ". A"). Each chunk's metadata is calculated, including character count (chunk_char_count), word count (chunk_word_count), and approximate token count (chunk_token_count, based on the assumption that 1 token equals ~4 characters). Finally, the processed chunk is appended to the pages_and_chunks list. At the end, len(pages_and_chunks) provides the total number of sentence chunks created.

In [52]:
# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [55]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': 470,
  'sentence_chunk': 'Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning activities may be used across various mobile devices, however, for the best user experience it is strongly recommended that users complete these activities using a desktop or laptop computer and in Google Chrome. \xa0 An interactive or media element has been excluded from this version of the text. You can view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=301 \xa0 An interactive or media element has been excluded from this version of the text. You can 470 | The Atom',
  'chunk_char_count': 813,
  'chunk_word_count': 120,
  'chunk_token_count': 203.25}]

In [56]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [58]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


# Show random chunks with under 30 tokens in length

In [59]:

min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 16.25 | Text: Updated January 2015. Accessed December 4, 2017. Middle Age | 917
Chunk token count: 23.25 | Text: view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=455   Infancy | 851
Chunk token count: 13.0 | Text: PART VII CHAPTER 7. ALCOHOL Chapter 7. Alcohol | 429
Chunk token count: 11.0 | Text: 978 | Food Supplements and Food Replacements
Chunk token count: 19.25 | Text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=519   Introduction | 991


# Filtering
This below code filters the chunks of text to include only those that have a token count greater than a specified minimum length (min_token_length, set to 30).

In [60]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

# Embedding creator
The code uses the SentenceTransformers library to generate sentence embeddings, which are numerical representations of textual data. It initializes a SentenceTransformer model with the pre-trained all-mpnet-base-v2 model, configured to run on the CPU. A list of sentences is defined, and the model.encode() method is used to encode these sentences into embeddings. The sentences and their corresponding embeddings are then stored in a dictionary, where the sentences serve as keys and the embeddings as values. Finally, the code iterates through the dictionary to print each sentence alongside its embedding. This enables the embeddings to be used for tasks such as semantic similarity, clustering, or other natural language processing applications.

In [61]:
# Requires !pip install sentence-transformers
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")



Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07981393e-02  3.03164497e-02 -2.01217979e-02  6.86483979e-02
 -2.55255587e-02 -8.47692136e-03 -2.07078308e-04 -6.32377341e-02
  2.81606130e-02 -3.33353430e-02  3.02634742e-02  5.30720465e-02
 -5.03526554e-02  2.62287818e-02  3.33313867e-02 -4.51579243e-02
  3.63044068e-02 -1.37113256e-03 -1.20170955e-02  1.14946328e-02
  5.04511520e-02  4.70857695e-02  2.11913157e-02  5.14607355e-02
 -2.03746576e-02 -3.58888917e-02 -6.67874468e-04 -2.94393655e-02
  4.95858677e-02 -1.05639929e-02 -1.52013954e-02 -1.31754915e-03
  4.48197387e-02  1.56023372e-02  8.60379657e-07 -1.21398771e-03
 -2.37978715e-02 -9.09400522e-04  7.34478515e-03 -2.53933389e-03
  5.23370095e-02 -4.68043461e-02  1.66214556e-02  4.71579246e-02
 -4.15599495e-02  9.01974272e-04  3.60279791e-02  3.42214853e-02
  9.68228132e-02  5.94828278e-02 -1.64984707e-02 -3.51249464e-02
  5.92516921e-03 -7.07960688e-04 -2.4103

# Embed all texts line by line, we can use the below method or if we have GPU then there is another way "batchwise" below, we can use any one

In [63]:
%%time

# Send the model to the GPU
embedding_model.to("cpu") # requires a GPU installed, for reference on my local machine, I'm using a NVIDIA RTX 4090

# Create embeddings one by one on the GPU
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: total: 30min 31s
Wall time: 5min 16s


# Turn text chunks into a single list

In [64]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

# batchwise

In [65]:


text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]])

# Save embeddings to file

In [66]:

text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [67]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242526e-02 9.02281031e-02 -5.09549025e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156381e-02 5.92138581e-02 -1.66167654e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5,[ 2.79801767e-02 3.39813717e-02 -2.06426829e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,[ 6.82566538e-02 3.81274410e-02 -8.46850406e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264010e-02 -8.49768426e-03 9.57162492e-...


# This code prepares a dataset of text chunks and their corresponding embeddings for further processing using PyTorch.

In [68]:

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([1680, 768])

In [69]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,"[0.0674242526, 0.0902281031, -0.00509549025, -..."
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,"[0.0552156381, 0.0592138581, -0.0166167654, -0..."
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5,"[0.0279801767, 0.0339813717, -0.0206426829, 0...."
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,"[0.0682566538, 0.038127441, -0.00846850406, -0..."
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,"[0.033026401, -0.00849768426, 0.00957162492, -..."


# Qdrant
Qdrant can be set up either as an in-memory instance or an on-disk instance, depending on your use case. An in-memory instance, created by passing ":memory:" as the path argument (e.g., QdrantClient(":memory:")), operates entirely in the system's RAM. This configuration is ideal for quick experiments, prototyping, or testing, as it provides faster operations due to the absence of disk I/O. However, because it operates in memory, all stored data will be lost when the program terminates. On the other hand, an on-disk instance is created by providing a valid file path to the path argument (e.g., QdrantClient(path="qdrant_storage")). This configuration persists data to disk, allowing the embeddings, collections, and metadata to remain intact across program runs. It is best suited for production scenarios or situations where the stored vectors need to be retained for long-term use. Additionally, for distributed systems or large-scale applications, a server-based instance can be used by connecting to a running Qdrant server via its URL (e.g., QdrantClient(url="http://localhost:6333")). This enables centralized storage and management of vector embeddings and allows multiple clients or applications to query and update the same collection. Each method provides flexibility, with in-memory instances optimized for speed and temporary usage, on-disk instances for persistence in standalone applications, and server-based instances for scalability and distributed access.

In [70]:

qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance

# all-mpnet-base-v2

The line initializes a SentenceTransformer object using the pre-trained model 'all-mpnet-base-v2'. This model is part of the Sentence Transformers library and is specifically designed for generating dense vector embeddings of text. Here's a detailed breakdown:

Purpose of SentenceTransformer('all-mpnet-base-v2'):
Embedding Model: 'all-mpnet-base-v2' is a transformer-based language model fine-tuned for sentence embeddings. It provides high-quality embeddings optimized for tasks like semantic similarity, clustering, and search.
Sentence Embeddings: The model converts text (e.g., sentences, paragraphs) into dense numerical vectors (embeddings) that capture semantic meaning.
Pre-Trained Weights: This model comes pre-trained on large text datasets, making it suitable for a variety of natural language processing (NLP) tasks without requiring further training.
Use Case:
Semantic Search: Use the embeddings to find semantically similar text, such as retrieving documents or answers to queries.
Clustering: Group related text data into meaningful clusters based on their embeddings.
Recommendation Systems: Build recommendation engines by finding similar text or items using embedding proximity.
Text Classification: Use the embeddings as input features for downstream classification tasks.
Cross-Lingual Applications: 'all-mpnet-base-v2' supports multiple languages, making it suitable for multilingual semantic tasks.
Why 'all-mpnet-base-v2'?
This model is one of the most accurate and efficient sentence embedding models, the MPNET architecture and trained with a wide range of datasets for general-purpose embeddings. It is particularly good for tasks where capturing nuanced semantic relationships between texts is crucial.

In [71]:
encoder = SentenceTransformer('all-mpnet-base-v2') # Model to create embeddings



# Create collection to store books
This code snippet creates a Qdrant collection named "health" to store vector embeddings, configuring it for a specific vector size and distance metric. Here's a detailed explanation:

recreate_collection: This method ensures that the collection is created anew. If a collection with the same name already exists, it deletes and recreates it. This is useful for starting with a clean slate when testing or updating data.

collection_name="health": The name of the collection is set to "health". A collection in Qdrant serves as a container for storing related embeddings (e.g., vectors representing text or data about health topics in this case).

models.VectorParams: Defines the configuration for the vectors in the collection:

size=encoder.get_sentence_embedding_dimension(): Specifies the dimensionality of the vectors, which is determined by the embedding model (e.g., SentenceTransformer). This ensures the collection can store vectors of the correct size.
distance=models.Distance.COSINE: Sets the similarity measure to cosine similarity, which is commonly used for comparing high-dimensional embeddings (e.g., to find similar books or text chunks based on their semantic meaning).
Use Case:
This setup is designed for storing and querying embeddings, such as those representing books, documents, or other text data. The "health" collection can be used to store embeddings related to health-related topics, enabling fast similarity searches. For example:

Search Applications: Quickly retrieve books or documents related to a query by finding vectors with high cosine similarity to the query embedding.
Recommendation Systems: Recommend similar content based on embedding similarity.
Clustering or Analysis: Group related health topics based on vector proximity.

In [72]:
qdrant.recreate_collection(
    collection_name="health",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
        distance=models.Distance.COSINE
    )
)

  qdrant.recreate_collection(


True

In [73]:
data = df.to_dict('records')

# vectorize!
This code uploads vectorized records into the health collection in Qdrant. The qdrant.upload_records method takes a list of records, where each record is created using the models.Record class. Each record includes a unique identifier (id), a vector embedding (vector), and a payload containing additional metadata (payload). The embeddings are generated by encoding the sentence_chunk field from the input data (data) using the encoder (a pre-trained SentenceTransformer model). The embeddings are converted to a list format before being added to the records. The enumerate function ensures that each record gets a unique id by using its index in the dataset. The uploaded records are then stored in the health collection, enabling efficient vector search and similarity queries. This process prepares the collection for semantic search or recommendation tasks based on the uploaded embeddings and metadata.

In [74]:
# vectorize!
qdrant.upload_records(
    collection_name="health",
    records=[
        models.Record(
            id=idx,
            vector=encoder.encode(doc["sentence_chunk"]).tolist(),
            payload=doc
        ) for idx, doc in enumerate(data) 
    ]
)

  qdrant.upload_records(


This code performs a semantic search on the health collection in Qdrant to retrieve the top 3 records most relevant to the query "macronutrients functions." It uses the qdrant.search method, where the query is encoded into a vector using the encoder (a SentenceTransformer model) and converted to a list format. The search cosine similarity (as defined in the collection setup) to find records that are closest to the query vector. The results (hits) are iterated over, and for each result, the payload (metadata or content associated with the record) and the similarity score are printed. This process enables efficient semantic search, allowing users to find and retrieve the most relevant health-related information or content based on their natural language query.

In [75]:
hits = qdrant.search(
    collection_name="health",
    query_vector=encoder.encode("macronutrients functions").tolist(),
    limit=3
)
for hit in hits:
  print(hit.payload, "score:", hit.score)

{'page_number': 5, 'sentence_chunk': 'Macronutrients Nutrients that are needed in large amounts are called macronutrients. There are three classes of macronutrients: carbohydrates, lipids, and proteins. These can be metabolically processed into cellular energy. The energy from macronutrients comes from their chemical bonds. This chemical energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions. A unit of measurement of food energy is the calorie. On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied by one thousand. A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition food labels. Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield calories. Carbohydrates Carbohydrates are molecules composed of carb

In [76]:
# define a variable to hold the search results
search_results = [hit.payload for hit in hits]

In [77]:
search_results

[{'page_number': 5,
  'sentence_chunk': 'Macronutrients Nutrients that are needed in large amounts are called macronutrients. There are three classes of macronutrients: carbohydrates, lipids, and proteins. These can be metabolically processed into cellular energy. The energy from macronutrients comes from their chemical bonds. This chemical energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions. A unit of measurement of food energy is the calorie. On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied by one thousand. A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition food labels. Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield calories. Carbohydrates Carbohydrates are molecules composed of c

In [78]:
context = "\n".join([
        f"- {result['page_number']} ({result['chunk_char_count']}): {result['sentence_chunk']}"
        for result in search_results
    ])

In [79]:
context

'- 5 (987): Macronutrients Nutrients that are needed in large amounts are called macronutrients. There are three classes of macronutrients: carbohydrates, lipids, and proteins. These can be metabolically processed into cellular energy. The energy from macronutrients comes from their chemical bonds. This chemical energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions. A unit of measurement of food energy is the calorie. On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied by one thousand. A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition food labels. Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield calories. Carbohydrates Carbohydrates are molecules composed of carbon, hydrogen, and oxygen.\

# llama3_with_search_results

This code defines a function, llama3_with_search_results, to interact with a locally hosted LLaMA API and generate responses by combining search results, a system message, and a user query. The search results, which include metadata such as page number, character count, and text chunks, are formatted into a structured context string. This context is then combined with a system message (defining the chatbot's role, such as a health specialist) and the user query to form a single prompt. The function sends this prompt to the LLaMA API via a POST request, streaming the response line by line, parsing it as JSON, and assembling the final output. Error handling ensures that any issues during the API call are captured and returned as an error message. The example usage demonstrates how to use this function, showcasing a scenario where a chatbot provides health-related information based on user queries and relevant search results. This integration enhances the chatbot's ability to generate accurate, contextually aware responses by the combined power of semantic search and generative AI.

In [81]:
import requests
import json

# Function to query LLaMA API with search results and a system message
def llama3_with_search_results(search_results, system_message, user_message):
    url = "http://localhost:11434/api/generate"

    # Format search results as context
    context = "\n".join([
        f"- {result['page_number']} ({result['chunk_char_count']}): {result['sentence_chunk']}"
        for result in search_results
    ])

    # Combine system message, search results context, and user message into the final prompt
    prompt = f"{system_message}\n\nContext:\n{context}\n\nUser Query: {user_message}"

    # Payload for LLaMA API
    payload = {
        "model": "llama3",
        "prompt": prompt
    }

    try:
        # Stream the response
        response = requests.post(url, json=payload, headers={"Content-Type": "application/json"}, stream=True)

        # Check for successful status
        if response.status_code == 200:
            # Concatenate all response parts
            final_response = ""
            for line in response.iter_lines():
                if line:
                    # Parse each line as JSON
                    part = json.loads(line)
                    final_response += part.get("response", "")
            return final_response
        else:
            return f"Error: {response.status_code}, {response.text}"
    except Exception as e:
        return f"An error occurred: {e}"
# Example usage
if __name__ == "__main__":
    # Mock search results (replace this with the actual results from Qdrant)
    # System and user messages
    system_message = "You are a chatbot, a health specialist. Your top priority is to help guide users into providing health related information"
    user_message = "what is macronutrients functions"

    # Get response from LLaMA
    response = llama3_with_search_results(search_results, system_message, user_message)
    print("LLaMA Response:\n", response)


LLaMA Response:
 Based on the provided context, I'd be happy to help you understand the functions of macronutrients.

Macronutrients are nutrients that are needed in large amounts by the body. The three classes of macronutrients are carbohydrates, lipids (or fats), and proteins. These macronutrients can be metabolically processed into cellular energy, which is then utilized to perform various bodily functions.

Here's a breakdown of the functions of each macronutrient:

1. **Carbohydrates**: Carbohydrates provide energy for the body. They are broken down into glucose, which is then used by cells to produce energy.
2. **Lipids (or Fats)**: Lipids provide energy and help maintain cell membrane structure. They also serve as a storage form of energy.
3. **Proteins**: Proteins build and repair tissues in the body, such as muscles, bones, skin, and hair. They are also involved in many bodily functions, including immune function and hormone regulation.

In summary, macronutrients play a cruci