In [None]:
# download pdf

In [1]:
import os
import requests

# get the pdf document path
pdf_path = r"David and Goliath.pdf"

# download pdf
if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading....")

    # enter the path of the pdf
    url = 'https://core-docs.s3.amazonaws.com/documents/asset/uploaded_file/222892/david-and-goliath.pdf'

    # the local filename of the downloaded file
    filename = pdf_path

    # send a get request to the url
    response = requests.get(url)

    # check if the request was successful
    if response.status_code == 200:
        # open the file and save it
        with open(filename, 'wb') as file:
            file.write(response.content)
        print(f"[INFO] The file has been downloaded and saved as {filename}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists")

File David and Goliath.pdf exists


In [2]:
# open and preprocess pdf

In [3]:
import fitz # requires: !pip install PyMuPDF
from tqdm.auto import tqdm # pip install tqdm

def text_formatter(text: str) -> str:
    """ Performs minor formatting on text. """
    cleaned_text = text.replace("\n", " ").strip()

    # potentially more text formatting functions go here
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text = text)
        pages_and_texts.append({"page_number" : page_number + 1, # page_number - 5,6 (adjust accoring to your pdf's starting page)
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token = ~ 4 characters
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path = pdf_path)
pages_and_texts[:10]

0it [00:00, ?it/s]

[{'page_number': 1,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 3,
  'page_char_count': 86,
  'page_word_count': 15,
  'page_sentence_count_raw': 1,
  'page_token_count': 21.5,
  'text': 'Malcolm Gladwell   david and goliath Underdogs, Misfits and the Art of Battling Giants'},
 {'page_number': 4,
  'page_char_count': 891,
  'page_word_count': 156,
  'page_sentence_count_raw': 5,
  'page_token_count': 222.75,
  'text': 'Contents INTRODUCTION Goliath “Am I a dog that you should come to me with sticks?” PART ONE: THE ADVANTAGES OF DISADVANTAGES (AND THE DISADVANTAGES OF ADVANTAGES) ONE Vivek Ranadivé “It was really random. I mean, my father had never played basketball before.” TWO Teresa DeBrito “My largest class was twenty-nine kids. Oh, it was fun.” THRE

In [4]:
import random
random.sample(pages_and_texts, k = 3)

[{'page_number': 30,
  'page_char_count': 2679,
  'page_word_count': 480,
  'page_sentence_count_raw': 28,
  'page_token_count': 669.75,
  'text': 'lessons. Children of multimillionaires in Hollywood do not rake the leaves of their neighbors in Beverly Hills. Their fathers do not wave the electricity bill angrily at them if they leave the lights on. They do not sit in a basketball arena behind a pillar and wonder what it would be like to sit courtside. They live courtside. “My own instinct is that it’s much harder than anybody believes to bring up kids in a wealthy environment,” he said. “People are ruined by challenged economic lives. But they’re ruined by wealth as well because they lose their ambition and they lose their pride and they lose their sense of self-worth. It’s difficult at both ends of the spectrum. There’s some place in the middle which probably works best of all.” There are few things that inspire less sympathy than a multimillionaire crying the blues for his children,

In [5]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,210.0,210.0,210.0,210.0,210.0
mean,105.5,2228.32,387.0,21.99,557.08
std,60.77,1563.33,269.38,16.45,390.83
min,1.0,0.0,1.0,1.0,0.0
25%,53.25,521.25,94.25,5.0,130.31
50%,105.5,2741.0,454.5,23.5,685.25
75%,157.75,3537.5,616.5,35.75,884.38
max,210.0,5888.0,955.0,65.0,1472.0


Why would we care about Token Counts? 

Token Count is important to think about because: 
1. Embedding models don't deal with infinite tokens.
2. LLMs don't deal with infinite tokens.

For example, an embedding model may have been trained to embed sequences of 384 tokens into numerical space. (sentence transformers 'all-mpnet-base-v2', https://www.sbert.net/docs/pretrained_models.html)

As for LLMs, they can't accept infinite tokens in their context window.

### further text processing. (splitting pages into sentences)

 two ways to do this
 1. by splitting on ". "
 2. with a NLP library such as Spacy and NLTK. 

In [6]:
#!pip install spacy

In [7]:
from spacy.lang.en import English

nlp = English()

# add a sentencizer pipeline, https://spacy.io/api/sentencizer
nlp.add_pipe("sentencizer")

# create example instance of a document 
doc = nlp("Hi. How are you? What is your name? Good to see you.")
assert len(list(doc.sents)) == 4

list(doc.sents)

[Hi., How are you?, What is your name?, Good to see you.]

In [8]:
pages_and_texts[42]

{'page_number': 43,
 'page_char_count': 3827,
 'page_word_count': 659,
 'page_sentence_count_raw': 36,
 'page_token_count': 956.75,
 'text': 'in defying and attracting criticism, coming face-to-face with the big public.” But the very things that made the Salon so attractive—how selective and prestigious it was—also made it problematic. The Palais was an enormous barn of a building three hundred yards long with a central aisle that was two stories high. A typical Salon might accept three or four thousand paintings, and they were hung in four tiers, starting at ground level and stretching up to the ceiling. Only paintings that met with the unanimous approval of the jury were hung “on the line,” at eye level. If you were “skyed”—that is, hung closest to the ceiling—it was all but impossible for your painting to be seen. (One of Renoir’s paintings was once skyed in the dépotoir.) No painter could submit more than three works. The crowds were often overwhelming. The Salon was the Big Pond. 

In [9]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # make sure all the sentences are strings (the default datatype is a spacy datatype)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/210 [00:00<?, ?it/s]

In [10]:
random.sample(pages_and_texts, k = 1)

[{'page_number': 4,
  'page_char_count': 891,
  'page_word_count': 156,
  'page_sentence_count_raw': 5,
  'page_token_count': 222.75,
  'text': 'Contents INTRODUCTION Goliath “Am I a dog that you should come to me with sticks?” PART ONE: THE ADVANTAGES OF DISADVANTAGES (AND THE DISADVANTAGES OF ADVANTAGES) ONE Vivek Ranadivé “It was really random. I mean, my father had never played basketball before.” TWO Teresa DeBrito “My largest class was twenty-nine kids. Oh, it was fun.” THREE Caroline Sacks “If I’d gone to the University of Maryland, I’d still be in science.” PART TWO: THE THEORY OF DESIRABLE DIFFICULTY FOUR David Boies You wouldn’t wish dyslexia on your child. Or would you? FIVE Emil “Jay” Freireich “How Jay did it, I don’t know.” SIX Wyatt Walker “De rabbit is de slickest o’ all de animals de Lawd ever made.” PART THREE: THE LIMITS OF POWER SEVEN Rosemary Lawlor “I wasn’t born that way. This was forced upon me.” EIGHT Wilma Derksen “We have all done something dreadful in our li

In [11]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,210.0,210.0,210.0,210.0,210.0,210.0
mean,105.5,2228.32,387.0,21.99,557.08,25.62
std,60.77,1563.33,269.38,16.45,390.83,19.12
min,1.0,0.0,1.0,1.0,0.0,0.0
25%,53.25,521.25,94.25,5.0,130.31,5.0
50%,105.5,2741.0,454.5,23.5,685.25,28.0
75%,157.75,3537.5,616.5,35.75,884.38,41.0
max,210.0,5888.0,955.0,65.0,1472.0,70.0


### chunking the sentences together (splitting larger pieces into smaller ones is referred to as splitting or chunking). 
no 100% correct way to do this.
split into groups of 10 sentences for simplicity. the number can be changed.
frameworks like langchain might help. but doing this purely on python.

Why we do this?
1. so our texts are easier to filter. smaller groups of text can be easier to inspect than the large passages.
2. so our texts chunks fit into our embedding model context window (e.g. 384 tokens as limit).
3. so our contexts passed into LLMs can be specific and focused.

In [12]:
# define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# create a function to split lists of text recursively into chunk size
# e.g. [20] -> [10, 10], [25] -> [10, 10, 5]
def split_list(input_list: list[str], slice_size: int = num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [13]:
# loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list= item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/210 [00:00<?, ?it/s]

In [14]:
random.sample(pages_and_texts, k=1)

[{'page_number': 44,
  'page_char_count': 3376,
  'page_word_count': 617,
  'page_sentence_count_raw': 35,
  'page_token_count': 844.0,
  'text': 'The lesson of the Impressionists is that there are times and places where it is better to be a Big Fish in a Little Pond than a Little Fish in a Big Pond, where the apparent disadvantage of being an outsider in a marginal world turns out not to be a disadvantage at all. Pissarro, Monet, Renoir, and Cézanne weighed prestige against visibility, selectivity against freedom, and decided the costs of the Big Pond were too great. Caroline Sacks faced the same choice. She could be a Big Fish at the University of Maryland, or a Little Fish at one of the most prestigious universities in the world. She chose the Salon over the three rooms on Boulevard des Capucines—and she ended up paying a high price. 4. The trouble for Caroline Sacks began in the spring of her freshman year, when she enrolled in chemistry. She was probably taking too many courses, s

In [15]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,210.0,210.0,210.0,210.0,210.0,210.0,210.0
mean,105.5,2228.32,387.0,21.99,557.08,25.62,3.05
std,60.77,1563.33,269.38,16.45,390.83,19.12,1.85
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,53.25,521.25,94.25,5.0,130.31,5.0,1.0
50%,105.5,2741.0,454.5,23.5,685.25,28.0,3.0
75%,157.75,3537.5,616.5,35.75,884.38,41.0,5.0
max,210.0,5888.0,955.0,65.0,1472.0,70.0,7.0


### Splitting each chunk into its own item

embed chunk of sentences into its own numerical representation. that'll give us a good granularity.

meaning, we can dive specifically into the text sample that was used in our model.

In [16]:
import re

#split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # join the sentences together into a paragraph like structure (join the list of sentences into one paragraph)
        joined_sentence_chunk = "".join(sentence_chunk).replace(" ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" (will work for any capital letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        #get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 #1 token = ~4 chars

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)
        

  0%|          | 0/210 [00:00<?, ?it/s]

641

In [17]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 28,
  'sentence_chunk': 'It’s safe to say that there isn’t a single profession in the world that has increased its numbers over the past two decades by as much or as quickly or at such expense as teaching has. One country after another has spent that kind of money because we look at a school like Shepaug Valley—where every teacher has a chance to get to know every student—and we think, “There’s the place to send my child.”But the evidence suggests that the thing we are convinced is such a big advantage might not be such an advantage at all.*3. Not long ago, I sat down with one of the most powerful people in Hollywood. He began by talking about his childhood in Minneapolis. He would go up and down the streets of his neighborhood at the beginning of every winter, he said, getting commitments from people who wanted their driveways and sidewalks cleared of snow. Then he would contract out each job to other children in the neighborhood. He paid his workers the moment the jo

In [18]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,641.0,641.0,641.0,641.0
mean,91.22,728.18,125.61,182.04
std,49.89,353.93,58.94,88.48
min,3.0,2.0,1.0,0.5
25%,52.0,498.0,87.0,124.5
50%,89.0,718.0,126.0,179.5
75%,127.0,954.0,163.0,238.5
max,210.0,2112.0,358.0,528.0


In [19]:
528 - 384

144

In [20]:
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,3,Malcolm Gladwell david and goliath Underdogs...,86,15,21.5
1,4,Contents INTRODUCTION Goliath “Am I a dog that...,677,112,169.25
2,4,PART THREE: THE LIMITS OF POWER SEVEN Rosemary...,205,36,51.25
3,5,André Trocmé “We feel obliged to tell you that...,128,21,32.0
4,6,BY THE SAME AUTHOR What the Dog Saw Outliers B...,68,13,17.0


### filter chunks of text for short chunks
these chunks may not contain much useful information

In [21]:
# show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 17.0 | Text: BY THE SAME AUTHOR What the Dog Saw Outliers Blink The Tipping Point
Chunk token count: 16.0 | Text: The crowd grew angrier. Stones turned to petrol bombs and petrol
Chunk token count: 23.0 | Text: Later in the war, when the Gestapo stepped up their scrutiny of Le Chambon, Trocmé and Theis
Chunk token count: 10.25 | Text: I wouldn’t be where I am today without my
Chunk token count: 5.75 | Text: Do you say, ‘You’ve got


In [22]:
# filter our dataframe for rows with under 30 tokens
pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_length[:2]

[{'page_number': 4,
  'sentence_chunk': 'Contents INTRODUCTION Goliath “Am I a dog that you should come to me with sticks?”PART ONE: THE ADVANTAGES OF DISADVANTAGES (AND THE DISADVANTAGES OF ADVANTAGES) ONE Vivek Ranadivé “It was really random. I mean, my father had never played basketball before.”TWO Teresa DeBrito “My largest class was twenty-nine kids. Oh, it was fun.”THREE Caroline Sacks “If I’d gone to the University of Maryland, I’d still be in science.”PART TWO: THE THEORY OF DESIRABLE DIFFICULTY FOUR David Boies You wouldn’t wish dyslexia on your child. Or would you?FIVE Emil “Jay” Freireich “How Jay did it, I don’t know.”SIX Wyatt Walker “De rabbit is de slickest o’ all de animals de Lawd ever made.”',
  'chunk_char_count': 677,
  'chunk_word_count': 112,
  'chunk_token_count': 169.25},
 {'page_number': 4,
  'sentence_chunk': 'PART THREE: THE LIMITS OF POWER SEVEN Rosemary Lawlor “I wasn’t born that way. This was forced upon me.”EIGHT Wilma Derksen “We have all done something 

In [23]:
random.sample(pages_and_chunks_over_min_token_length, k=1)

[{'page_number': 33,
  'sentence_chunk': 'He was at the point where money starts to make the job of raising normal and well-adjusted children more difficult. What the parenting graph really looks like is this: That’s what is called an inverted-U curve. Inverted-U curves are hard to understand. They almost never fail to take us by surprise, and one of the reasons we are so often confused about advantages and disadvantages is that we forget when we are operating in a U-shaped world.*Which brings us back to the puzzle of class size: What if the relationship between the number of children in a classroom and academic performance is not this:',
  'chunk_char_count': 602,
  'chunk_word_count': 104,
  'chunk_token_count': 150.5}]

### Embedding the text chunks
Embeddings are broad but powerful concepts. While humans understand text, machines understand numbers. What we'd like to do is turn our text chunks into numbers, specifically embeddings. 

A useful numerical representation. The best part about embeddings is that are a *learned representation*.
https://vickiboykis.com/what_are_embeddings/


In [24]:
# can download embedding models from huggingface. using sentence transformers here.

In [25]:
# !pip install -U sentence-transformers

In [26]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cpu") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07981486e-02  3.03164609e-02 -2.01218035e-02  6.86484054e-02
 -2.55256072e-02 -8.47688783e-03 -2.07072138e-04 -6.32376745e-02
  2.81606335e-02 -3.33352908e-02  3.02634649e-02  5.30720614e-02
 -5.03526367e-02  2.62287930e-02  3.33313346e-02 -4.51578461e-02
  3.63044366e-02 -1.37113337e-03 -1.20171262e-02  1.14947371e-02
  5.04511185e-02  4.70856875e-02  2.11912617e-02  5.14607579e-02
 -2.03746445e-02 -3.58889662e-02 -6.67850894e-04 -2.94393003e-02
  4.95859645e-02 -1.05639212e-02 -1.52013600e-02 -1.31748675e-03
  4.48196381e-02  1.56023037e-02  8.60379885e-07 -1.21392484e-03
 -2.37978082e-02 -9.09449242e-04  7.34487036e-03 -2.53929570e-03
  5.23370095e-02 -4.68043499e-02  1.66214928e-02  4.71578799e-02
 -4.15599048e-02  9.01963038e-04  3.60278860e-02  3.42215225e-02
  9.68227163e-02  5.94828986e-02 -1.64984670e-02 -3.51249576e-02
  5.92511054e-03 -7.08015752e-04 -2.4103

In [27]:
embeddings[0].shape

(768,)

In [28]:
single_sentence = "Yo! How cool are embeddings?"
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

Sentence: Yo! How cool are embeddings?
Embedding:
[-1.97447408e-02 -4.51074727e-03 -4.98480070e-03  6.55444637e-02
 -9.87676531e-03  2.72835623e-02  3.66426483e-02 -3.30220326e-03
  8.50080699e-03  8.24949890e-03 -2.28497311e-02  4.02430892e-02
 -5.75200357e-02  6.33692592e-02  4.43207696e-02 -4.49507013e-02
  1.25284297e-02 -2.52012368e-02 -3.55292186e-02  1.29559264e-02
  8.67021549e-03 -1.92917809e-02  3.55632440e-03  1.89505927e-02
 -1.47128645e-02 -9.39846039e-03  7.64174480e-03  9.62190982e-03
 -5.98929264e-03 -3.90169248e-02 -5.47824427e-02 -5.67454426e-03
  1.11645041e-02  4.08067182e-02  1.76319111e-06  9.15295631e-03
 -8.77262745e-03  2.39382870e-02 -2.32784320e-02  8.04998875e-02
  3.19176428e-02  5.12597337e-03 -1.47708645e-02 -1.62524860e-02
 -6.03213087e-02 -4.35689315e-02  4.51210998e-02 -1.79054104e-02
  2.63366662e-02 -3.47866528e-02 -8.89173429e-03 -5.47674820e-02
 -1.24372374e-02 -2.38606799e-02  8.33496749e-02  5.71242459e-02
  1.13328742e-02 -1.49595216e-02  9.2037

In [29]:
%%time
# embed our chunks
embedding_model.to("cuda")

for item in tqdm(pages_and_chunks_over_min_token_length):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/618 [00:00<?, ?it/s]

CPU times: total: 1min 16s
Wall time: 7.66 s


In [30]:
# # Turn text chunks into a single list
# text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_length]

In [31]:
# len(text_chunks)

In [32]:
# %%time

# # embed all texts in batches
# text_chunk_embeddings = embedding_model.encode(text_chunks,
#                                               batch_size=32, # experiment with this
#                                               convert_to_tensor=True)
# text_chunk_embeddings

### Save embeddings to file

In [35]:
# pages_and_chunks_over_min_token_length[419]

In [36]:
# text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_length)
# text_chunks_and_embeddings_df

In [37]:
# save embeddings to a file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_length)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [38]:
# import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,4,Contents INTRODUCTION Goliath “Am I a dog that...,677,112,169.25,[ 5.94012700e-02 6.20729737e-02 -2.34815180e-...
1,4,PART THREE: THE LIMITS OF POWER SEVEN Rosemary...,205,36,51.25,[ 6.71318993e-02 1.13388568e-01 2.35880613e-...
2,5,André Trocmé “We feel obliged to tell you that...,128,21,32.0,[ 1.26770353e-02 5.31808361e-02 6.09793654e-...
3,8,"But the Lord said to Samuel, “Do not look on h...",247,48,61.75,[ 4.04367410e-03 3.20881829e-02 -2.46701147e-...
4,9,Introduction Goliath “AM I A DOG THAT YOU SHOU...,1118,193,279.5,[-2.74488553e-02 5.33705093e-02 -7.90666975e-...


if the embedding database is really large (eg. over 100k-1M samples), you might want to look into using vector database for storage.

# RAG - Search and Answer

Retrieve relevant passage bassed on a query and use those passages to augment an input to an LLM so it can generate an output based on those relevant passages.

In [39]:
# similarity search/ semantic search/ vector search

In [40]:
import random

import torch
import numpy as np 
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([618, 768])

In [42]:
# text_chunks_and_embeddings_df

In [44]:
# create model
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

Embedding Model Ready.

Create a small semantic search pipeline.

In essence, we want to search for a query and get back relevant passages from our textbook.

We can do so with the following steps:
1. Define the query string.
2. Turn the query string into an embedding.
3. Perform a dot product or cosine similarity function between the text embeddings and query embedding.
4. Sort the results from 3 in desending order  
    

NOTE: to use dot products for comparison, ensure vectors are of same shape (eg. 768) and tensors/ vectors are in the same datatype (e.g. both are in float32).

In [45]:
# 1. define the query
query = "goliath"
print(f"Query: {query}")

# 2. Embed the query
# NOTE: it is important to embed your query with the same model used for embedding your passages
query_embedding = embedding_model.encode(query, convert_to_tensor=True).to("cuda")

# 3. get the similarity scores with the dot product (use cosine similarity if outputs of model aren't normalized)
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time: .5f} seconds.")

# 4. get the top-k results (we'll keep this 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: goliath
[INFO] Time taken to get scores on 618 embeddings:  0.05482 seconds.


torch.return_types.topk(
values=tensor([0.6283, 0.6070, 0.6064, 0.6003, 0.5990], device='cuda:0'),
indices=tensor([548,  25,  23,  21,  14], device='cuda:0'))