In [1]:
import os
import requests

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    filename = pdf_path
    response = requests.get(url)

    if response.status_code == 200:
        with open(filename, "wb") as file:
            file.write(response.content)



In [2]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append(
            {
                "page_number": page_number - 41,
                "page_char_count": len(text),
                "page_word_count": len(text.split(" ")),
                "page_sentence_count_raw": len(text.split(", ")),
                "page_token_count": len(text) / 4,
                "text": text
            }
        )
    
    return pages_and_texts


pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [3]:
pages_and_texts[100]

{'page_number': 59,
 'page_char_count': 629,
 'page_word_count': 109,
 'page_sentence_count_raw': 4,
 'page_token_count': 157.25,
 'text': 'Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health. Recent  scientific studies have shown that probiotic supplements positively  affect intestinal microbial flora, which in turn positively affect  immune system function. As good nutrition is known to influence  immunity, there is great interest in using probiotic foods and other  immune-system-friendly foods as a way to prevent illness. In this  chapter we will explore not only immune system function, but also  Introduction  |  59'}

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,11,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,2,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,1,199.25,Contents Preface University of Hawai‘i at Mā...


In [5]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x29a3fa0f450>

In [6]:
a = pages_and_texts[100]
s = nlp(a['text']).sents

In [7]:
[print(str(se)) for se in s]

Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health.
Recent  scientific studies have shown that probiotic supplements positively  affect intestinal microbial flora, which in turn positively affect  immune system function.
As good nutrition is known to influence  immunity, there is great interest in using probiotic foods and other  immune-system-friendly foods as a way to prevent illness.
In this  chapter we will explore not only immune system function, but also  Introduction  |  59


[None, None, None, None]

In [8]:
a

{'page_number': 59,
 'page_char_count': 629,
 'page_word_count': 109,
 'page_sentence_count_raw': 4,
 'page_token_count': 157.25,
 'text': 'Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health. Recent  scientific studies have shown that probiotic supplements positively  affect intestinal microbial flora, which in turn positively affect  immune system function. As good nutrition is known to influence  immunity, there is great interest in using probiotic foods and other  immune-system-friendly foods as a way to prevent illness. In this  chapter we will explore not only immune system function, but also  Introduction  |  59'}

In [9]:
a['sentences'] = list(nlp(a['text']).sents)

In [10]:
a

{'page_number': 59,
 'page_char_count': 629,
 'page_word_count': 109,
 'page_sentence_count_raw': 4,
 'page_token_count': 157.25,
 'text': 'Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health. Recent  scientific studies have shown that probiotic supplements positively  affect intestinal microbial flora, which in turn positively affect  immune system function. As good nutrition is known to influence  immunity, there is great interest in using probiotic foods and other  immune-system-friendly foods as a way to prevent illness. In this  chapter we will explore not only immune system function, but also  Introduction  |  59',
 'sentences': [Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health.,
  Recent  scient

In [11]:
from textblob import Sentence
import random

for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item['text']).sents)
    item['sentences'] = [str(s) for s in item['sentences']]
    item['spacy_sent_count'] = len(item['sentences'])

random.sample(pages_and_texts, k=1)

  0%|          | 0/1208 [00:00<?, ?it/s]

[{'page_number': 976,
  'page_char_count': 510,
  'page_word_count': 85,
  'page_sentence_count_raw': 6,
  'page_token_count': 127.5,
  'text': 'downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=512  \xa0 976  |  Water and Electrolyte Needs',
  'sentences': ['downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).',
   ' Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.',
   ' \xa0 An

In [12]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,spacy_sent_count
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1
1,-40,0,1,1,0.0,,[],0
2,-39,320,54,11,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1
3,-38,212,32,2,53.0,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1
4,-37,797,145,1,199.25,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,2


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1208 entries, 0 to 1207
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   page_number              1208 non-null   int64  
 1   page_char_count          1208 non-null   int64  
 2   page_word_count          1208 non-null   int64  
 3   page_sentence_count_raw  1208 non-null   int64  
 4   page_token_count         1208 non-null   float64
 5   text                     1208 non-null   object 
 6   sentences                1208 non-null   object 
 7   spacy_sent_count         1208 non-null   int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 75.6+ KB


In [14]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,spacy_sent_count
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,198.299669,11.053808,287.001035,10.319536
std,348.86387,560.382275,95.759336,8.950638,140.095569,6.300843
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.875,10.0
75%,864.25,1603.5,271.0,15.0,400.875,15.0
max,1166.0,2308.0,429.0,106.0,577.0,28.0


In [15]:
item

{'page_number': 1166,
 'page_char_count': 257,
 'page_word_count': 44,
 'page_sentence_count_raw': 1,
 'page_token_count': 64.25,
 'text': '23. Vitamin D reused “The Functions of Vitamin D” by Allison  Calabrese / Attribution – Sharealike  24. Vitamin K reused “Kale Lacinato Lacinato Kale” by BlackRiv\xa0/  Pixabay License; “Phylloquinone structure” by Mysid\xa0/ Public  Domain  1166  |  Attributions',
 'sentences': ['23.',
  'Vitamin D reused “The Functions of Vitamin D” by Allison  Calabrese / Attribution – Sharealike  24.',
  'Vitamin K reused “Kale Lacinato Lacinato Kale” by BlackRiv\xa0/  Pixabay License; “Phylloquinone structure” by Mysid\xa0/ Public  Domain  1166  |  Attributions'],
 'spacy_sent_count': 3}

In [16]:
chunk_size = 10

def split_list(input_list: list[str], slice_size: int=chunk_size) -> list[list[str]]:
    return [
        input_list[i:i+slice_size]
        for i in range(0, len(input_list), slice_size)
    ]

for item in tqdm(pages_and_texts):
    item['sentence_chunks'] = split_list(
        input_list=item['sentences'],
        slice_size=chunk_size
    )
    item['num_chunks'] = len(item['sentence_chunks'])

random.sample(pages_and_texts, k=1)

  0%|          | 0/1208 [00:00<?, ?it/s]

[{'page_number': 302,
  'page_char_count': 1027,
  'page_word_count': 181,
  'page_sentence_count_raw': 6,
  'page_token_count': 256.75,
  'text': 'Smell and Taste  Fat contains dissolved compounds that contribute to mouth- watering aromas and flavors. Fat also adds texture to food. Baked  foods are supple and moist. Frying foods locks in flavor and lessens  cooking time. How long does it take you to recall the smell of your  favorite food cooking? What would a meal be without that savory  aroma to delight your senses and heighten your preparedness for  eating a meal?  Fat plays another valuable role in nutrition. Fat contributes to  satiety, or the sensation of fullness. When fatty foods are swallowed  the body responds by enabling the processes controlling digestion  to retard the movement of food along the digestive tract, thus  promoting an overall sense of fullness. Oftentimes before the feeling  of fullness arrives, people overindulge in fat-rich foods, finding the  delectable ta

In [17]:
pages_and_texts[0]

{'page_number': -41,
 'page_char_count': 29,
 'page_word_count': 4,
 'page_sentence_count_raw': 1,
 'page_token_count': 7.25,
 'text': 'Human Nutrition: 2020 Edition',
 'sentences': ['Human Nutrition: 2020 Edition'],
 'spacy_sent_count': 1,
 'sentence_chunks': [['Human Nutrition: 2020 Edition']],
 'num_chunks': 1}

In [18]:
import re

from torch import chunk

pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item['sentence_chunks']:
        chunk_dict = {}
        chunk_dict['page_number'] = item['page_number']

        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.(A-Z)', r'. \1', joined_sentence_chunk)
        chunk_dict['sentence_chunk'] = joined_sentence_chunk

        chunk_dict['chunk_char_count'] = len(joined_sentence_chunk)
        chunk_dict['chunk_word_count'] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict['chunk_token_count'] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)
    
len(pages_and_chunks)    

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [19]:
df = pd.DataFrame(pages_and_chunks)
min_token_length = 30
pages_and_chunks_over_min_token_len = df[df['chunk_token_count'] > min_token_length].to_dict(orient='records')
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [20]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

for item in tqdm(pages_and_chunks_over_min_token_len):
    item['embedding'] = embedding_model.encode(item['sentence_chunk'])



  0%|          | 0/1680 [00:00<?, ?it/s]

In [21]:
pages_and_chunks_over_min_token_len[0]['embedding']

array([ 6.74242601e-02,  9.02280211e-02, -5.09550748e-03, -3.17545123e-02,
        7.39082098e-02,  3.51975970e-02, -1.97987389e-02,  4.67692316e-02,
        5.35727032e-02,  5.01230871e-03,  3.33928168e-02, -1.62221945e-03,
        1.76081192e-02,  3.62653509e-02, -3.16767226e-04, -1.07117658e-02,
        1.54257240e-02,  2.62176748e-02,  2.77661392e-03,  3.64942662e-02,
       -4.44109477e-02,  1.89362131e-02,  4.90117483e-02,  1.64020881e-02,
       -4.85783182e-02,  3.18282633e-03,  2.72992365e-02, -2.04752106e-03,
       -1.22828800e-02, -7.28049055e-02,  1.20446123e-02,  1.07300002e-02,
        2.09996291e-03, -8.17772970e-02,  2.67830114e-06, -1.81429237e-02,
       -1.20803546e-02,  2.47174744e-02, -6.27467111e-02,  7.35437647e-02,
        2.21624877e-02, -3.28767933e-02, -1.80095136e-02,  2.22952478e-02,
        5.61365150e-02,  1.79513067e-03,  5.25932014e-02, -3.31735611e-03,
       -8.33876524e-03, -1.06285121e-02,  2.31922977e-03, -2.23933738e-02,
       -1.53010953e-02, -

In [22]:
type(pages_and_chunks_over_min_token_len[0]['embedding'])

numpy.ndarray

In [23]:
pages_and_chunks_over_min_token_len[0]['embedding'].shape

(768,)

In [24]:
type(pages_and_chunks_over_min_token_len)

list

In [25]:
pages_and_chunks_over_min_token_len[0]['sentence_chunk']

'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE'

In [26]:
text_chunks = [item['sentence_chunk'] for item in pages_and_chunks_over_min_token_len]
text_chunks[0]

'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE'

In [27]:
text_chunks[9]

'Defining Protein University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 363 The Role of Proteins in Foods: Cooking and Denaturation University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 374 Protein Digestion and Absorption University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 378 Protein’s Functions in the Body University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 383 Diseases Involving Proteins University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 395 Proteins in a Nutshell University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 405 Proteins, Diet, and Personal Choices University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 409'

In [28]:
len(text_chunks)

1680

In [29]:
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=16,
                                               convert_to_tensor=True)
text_chunk_embeddings[0]

tensor([ 6.7424e-02,  9.0228e-02, -5.0955e-03, -3.1755e-02,  7.3908e-02,
         3.5198e-02, -1.9799e-02,  4.6769e-02,  5.3573e-02,  5.0123e-03,
         3.3393e-02, -1.6222e-03,  1.7608e-02,  3.6265e-02, -3.1677e-04,
        -1.0712e-02,  1.5426e-02,  2.6218e-02,  2.7766e-03,  3.6494e-02,
        -4.4411e-02,  1.8936e-02,  4.9012e-02,  1.6402e-02, -4.8578e-02,
         3.1828e-03,  2.7299e-02, -2.0475e-03, -1.2283e-02, -7.2805e-02,
         1.2045e-02,  1.0730e-02,  2.1000e-03, -8.1777e-02,  2.6783e-06,
        -1.8143e-02, -1.2080e-02,  2.4717e-02, -6.2747e-02,  7.3544e-02,
         2.2162e-02, -3.2877e-02, -1.8010e-02,  2.2295e-02,  5.6137e-02,
         1.7951e-03,  5.2593e-02, -3.3174e-03, -8.3388e-03, -1.0629e-02,
         2.3192e-03, -2.2393e-02, -1.5301e-02, -9.9306e-03,  4.6532e-02,
         3.5747e-02, -2.5476e-02,  2.6370e-02,  3.7492e-03, -3.8268e-02,
         2.5833e-02,  4.1287e-02,  2.5818e-02,  3.3297e-02, -2.5178e-02,
         4.5152e-02,  4.4899e-04, -9.9662e-02,  4.9

In [30]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
save_path = 'text_chunks_and_embeddings_df.csv'
text_chunks_and_embeddings_df.to_csv(save_path, index=False)

In [31]:
text_chunks_and_embeddings_df_load = pd.read_csv(save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242601e-02 9.02280211e-02 -5.09550748e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156493e-02 5.92138097e-02 -1.66167859e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,765,113,191.25,[ 2.79801544e-02 3.39813344e-02 -2.06427332e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,940,141,235.0,[ 6.82566687e-02 3.81274596e-02 -8.46859440e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264233e-02 -8.49772617e-03 9.57152713e-...


In [41]:
import torch
import numpy as np

text_chunks_and_embeddings_df = pd.read_csv(save_path)


In [42]:
text_chunks_and_embeddings_df['embedding'].head()

0    [ 6.74242601e-02  9.02280211e-02 -5.09550748e-...
1    [ 5.52156493e-02  5.92138097e-02 -1.66167859e-...
2    [ 2.79801544e-02  3.39813344e-02 -2.06427332e-...
3    [ 6.82566687e-02  3.81274596e-02 -8.46859440e-...
4    [ 3.30264233e-02 -8.49772617e-03  9.57152713e-...
Name: embedding, dtype: object

In [43]:

text_chunks_and_embeddings_df['embedding'] = text_chunks_and_embeddings_df['embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

In [45]:
embeddings = torch.tensor(np.stack(
    text_chunks_and_embeddings_df['embedding'].tolist(),
    axis=0),
    dtype=torch.float32
    )

In [46]:
pages_and_chunks = text = text_chunks_and_embeddings_df.to_dict(orient="records")
text_chunks_and_embeddings_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.00,"[0.0674242601, 0.0902280211, -0.00509550748, -..."
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.50,"[0.0552156493, 0.0592138097, -0.0166167859, -0..."
2,-37,Contents Preface University of Hawai‘i at Māno...,765,113,191.25,"[0.0279801544, 0.0339813344, -0.0206427332, 0...."
3,-36,Lifestyles and Nutrition University of Hawai‘i...,940,141,235.00,"[0.0682566687, 0.0381274596, -0.0084685944, -0..."
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.50,"[0.0330264233, -0.00849772617, 0.00957152713, ..."
...,...,...,...,...,...,...
1675,1164,Flashcard Images Note: Most images in the flas...,1298,169,324.50,"[0.0185622983, -0.0164279416, -0.0127046509, -..."
1676,1164,Hazard Analysis Critical Control Points reused...,373,49,93.25,"[0.0334722139, -0.0570441, 0.0151489358, -0.01..."
1677,1165,ShareAlike 11.Organs reused “Pancreas Organ An...,1277,164,319.25,"[0.077051416, 0.00978544448, -0.0121817002, 0...."
1678,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,408,57,102.00,"[0.103045113, -0.0164702646, 0.00826841127, 0...."


In [47]:
embeddings.shape

torch.Size([1680, 768])

In [48]:
from sentence_transformers import util

query = "macronutrients functions"
print(f"Query : {query}")

query_embedding  = embedding_model.encode(query, convert_to_tensor=True)
dot_scores = util.dot_score(query_embedding, embeddings)[0]

top_results = torch.topk(dot_scores, k=5)
top_results

Query : macronutrients functions


torch.return_types.topk(
values=tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473]),
indices=tensor([42, 47, 41, 51, 46]))

In [52]:
dsArray = util.dot_score(query_embedding, embeddings)
type(dsArray)

torch.Tensor

In [53]:
dsArray.shape

torch.Size([1, 1680])

In [58]:
dsArray[0]

tensor([0.4343, 0.4406, 0.3667,  ..., 0.3941, 0.3321, 0.3707])

In [57]:
top5 = torch.topk(dsArray[0], k=5)
top5

torch.return_types.topk(
values=tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473]),
indices=tensor([42, 47, 41, 51, 46]))

In [49]:
top_results[0]

tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473])

In [50]:
top_results[1]

tensor([42, 47, 41, 51, 46])

In [51]:
pages_and_chunks[42]['sentence_chunk']

'Macronutrients Nutrients that are needed in large amounts are called macronutrients.There are three classes of macronutrients: carbohydrates, lipids, and proteins.These can be metabolically processed into cellular energy.The energy from macronutrients comes from their chemical bonds.This chemical energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions.A unit of measurement of food energy is the calorie.On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied by one thousand.A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition food labels.Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield calories. Carbohydrates Carbohydrates are molecules composed of carbon, hydrogen, and oxygen.'

In [59]:
def retrieve_relevant_resources(query: str, n_resources_to_return: int=5):
    query_embedding = embedding_model.encode(
        query, convert_to_tensor=True
    )

    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    scores, indices = torch.topk(dot_scores, k=n_resources_to_return)
    return scores, indices

retrieve_relevant_resources(query)

(tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473]),
 tensor([42, 47, 41, 51, 46]))

In [60]:
def print_top_results_and_scores(query: str, n_resources_to_return: int=5):
    scores, indices = retrieve_relevant_resources(
        query, n_resources_to_return=n_resources_to_return
    )

    for score, idx in zip(scores, indices):
        print(f"Score : {score:.5f}")
        print("Text")
        print(pages_and_chunks[idx]['sentence_chunk'])
        print('\n\n')

print_top_results_and_scores(query)

Score : 0.69258
Text
Macronutrients Nutrients that are needed in large amounts are called macronutrients.There are three classes of macronutrients: carbohydrates, lipids, and proteins.These can be metabolically processed into cellular energy.The energy from macronutrients comes from their chemical bonds.This chemical energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions.A unit of measurement of food energy is the calorie.On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied by one thousand.A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition food labels.Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield calories. Carbohydrates Carbohydrates are molecules composed of carbon, hydrogen, and oxygen.