In [1]:
# download pdf

In [2]:
import os
import requests

# get the pdf document path
pdf_path = r"David and Goliath.pdf"

# download pdf
if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading....")

    # enter the path of the pdf
    url = 'https://core-docs.s3.amazonaws.com/documents/asset/uploaded_file/222892/david-and-goliath.pdf'

    # the local filename of the downloaded file
    filename = pdf_path

    # send a get request to the url
    response = requests.get(url)

    # check if the request was successful
    if response.status_code == 200:
        # open the file and save it
        with open(filename, 'wb') as file:
            file.write(response.content)
        print(f"[INFO] The file has been downloaded and saved as {filename}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists")

File David and Goliath.pdf exists


In [3]:
# open and preprocess pdf

In [4]:
import fitz # requires: !pip install PyMuPDF
from tqdm.auto import tqdm # pip install tqdm

def text_formatter(text: str) -> str:
    """ Performs minor formatting on text. """
    cleaned_text = text.replace("\n", " ").strip()

    # potentially more text formatting functions go here
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text = text)
        pages_and_texts.append({"page_number" : page_number + 1, # page_number - 5,6 (adjust accoring to your pdf's starting page)
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token = ~ 4 characters
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path = pdf_path)
pages_and_texts[:10]

0it [00:00, ?it/s]

[{'page_number': 1,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 3,
  'page_char_count': 86,
  'page_word_count': 15,
  'page_sentence_count_raw': 1,
  'page_token_count': 21.5,
  'text': 'Malcolm Gladwell   david and goliath Underdogs, Misfits and the Art of Battling Giants'},
 {'page_number': 4,
  'page_char_count': 891,
  'page_word_count': 156,
  'page_sentence_count_raw': 5,
  'page_token_count': 222.75,
  'text': 'Contents INTRODUCTION Goliath “Am I a dog that you should come to me with sticks?” PART ONE: THE ADVANTAGES OF DISADVANTAGES (AND THE DISADVANTAGES OF ADVANTAGES) ONE Vivek Ranadivé “It was really random. I mean, my father had never played basketball before.” TWO Teresa DeBrito “My largest class was twenty-nine kids. Oh, it was fun.” THRE

In [5]:
import random
random.sample(pages_and_texts, k = 3)

[{'page_number': 58,
  'page_char_count': 3182,
  'page_word_count': 581,
  'page_sentence_count_raw': 33,
  'page_token_count': 795.5,
  'text': 'having a brain so sluggish that when it comes to putting together the building blocks of words, those crucial 40 milliseconds simply go by too quickly? “If you have no concept of the sounds of language—if you take away a letter, if you take away a sound, and you don’t know what to do, then it’s really hard to map the sounds to the written counterparts,” Nadine Gaab, a dyslexia researcher at Harvard, explained. “It may take you a while to learn to read. You read really slowly, which then impairs your reading fluency, which then impairs your reading comprehension, because you’re so slow that by the time you’re at the end of the sentence, you’ve forgotten what the beginning of the sentence was. So it leads to all these problems in middle school or high school. Then it starts affecting all other subjects in school. You can’t read. How are you go

In [6]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,210.0,210.0,210.0,210.0,210.0
mean,105.5,2228.32,387.0,21.99,557.08
std,60.77,1563.33,269.38,16.45,390.83
min,1.0,0.0,1.0,1.0,0.0
25%,53.25,521.25,94.25,5.0,130.31
50%,105.5,2741.0,454.5,23.5,685.25
75%,157.75,3537.5,616.5,35.75,884.38
max,210.0,5888.0,955.0,65.0,1472.0


Why would we care about Token Counts? 

Token Count is important to think about because: 
1. Embedding models don't deal with infinite tokens.
2. LLMs don't deal with infinite tokens.

For example, an embedding model may have been trained to embed sequences of 384 tokens into numerical space. (sentence transformers 'all-mpnet-base-v2', https://www.sbert.net/docs/pretrained_models.html)

As for LLMs, they can't accept infinite tokens in their context window.

### further text processing. (splitting pages into sentences)

 two ways to do this
 1. by splitting on ". "
 2. with a NLP library such as Spacy and NLTK. 

In [7]:
#!pip install spacy

In [8]:
from spacy.lang.en import English

nlp = English()

# add a sentencizer pipeline, https://spacy.io/api/sentencizer
nlp.add_pipe("sentencizer")

# create example instance of a document 
doc = nlp("Hi. How are you? What is your name? Good to see you.")
assert len(list(doc.sents)) == 4

list(doc.sents)

[Hi., How are you?, What is your name?, Good to see you.]

In [9]:
pages_and_texts[42]

{'page_number': 43,
 'page_char_count': 3827,
 'page_word_count': 659,
 'page_sentence_count_raw': 36,
 'page_token_count': 956.75,
 'text': 'in defying and attracting criticism, coming face-to-face with the big public.” But the very things that made the Salon so attractive—how selective and prestigious it was—also made it problematic. The Palais was an enormous barn of a building three hundred yards long with a central aisle that was two stories high. A typical Salon might accept three or four thousand paintings, and they were hung in four tiers, starting at ground level and stretching up to the ceiling. Only paintings that met with the unanimous approval of the jury were hung “on the line,” at eye level. If you were “skyed”—that is, hung closest to the ceiling—it was all but impossible for your painting to be seen. (One of Renoir’s paintings was once skyed in the dépotoir.) No painter could submit more than three works. The crowds were often overwhelming. The Salon was the Big Pond. 

In [10]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # make sure all the sentences are strings (the default datatype is a spacy datatype)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/210 [00:00<?, ?it/s]

In [11]:
random.sample(pages_and_texts, k = 1)

[{'page_number': 13,
  'page_char_count': 3286,
  'page_word_count': 589,
  'page_sentence_count_raw': 26,
  'page_token_count': 821.5,
  'text': 'expecting had suddenly changed shape. “You come against me with sword and spear and javelin,” David said to Goliath, “but I come against you in the name of the Lord Almighty, the God of the armies of Israel, whom you have defied. This day the Lord will deliver you into my hands, and I’ll strike you down and cut off your head.… All those gathered here will know that it is not by sword or spear that the Lord saves; for the battle is the Lord, and he will give all of you into our hands.” Twice David mentions Goliath’s sword and spear, as if to emphasize how profoundly different his intentions are. Then he reaches into his shepherd’s bag for a stone, and at that point no one watching from the ridges on either side of the valley would have considered David’s victory improbable. David was a slinger, and slingers beat infantry, hands down. “Goliath

In [12]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,210.0,210.0,210.0,210.0,210.0,210.0
mean,105.5,2228.32,387.0,21.99,557.08,25.62
std,60.77,1563.33,269.38,16.45,390.83,19.12
min,1.0,0.0,1.0,1.0,0.0,0.0
25%,53.25,521.25,94.25,5.0,130.31,5.0
50%,105.5,2741.0,454.5,23.5,685.25,28.0
75%,157.75,3537.5,616.5,35.75,884.38,41.0
max,210.0,5888.0,955.0,65.0,1472.0,70.0


### chunking the sentences together (splitting larger pieces into smaller ones is referred to as splitting or chunking). 
no 100% correct way to do this.
split into groups of 10 sentences for simplicity. the number can be changed.
frameworks like langchain might help. but doing this purely on python.

Why we do this?
1. so our texts are easier to filter. smaller groups of text can be easier to inspect than the large passages.
2. so our texts chunks fit into our embedding model context window (e.g. 384 tokens as limit).
3. so our contexts passed into LLMs can be specific and focused.

In [13]:
# define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# create a function to split lists of text recursively into chunk size
# e.g. [20] -> [10, 10], [25] -> [10, 10, 5]
def split_list(input_list: list[str], slice_size: int = num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [14]:
# loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list= item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/210 [00:00<?, ?it/s]

In [15]:
random.sample(pages_and_texts, k=1)

[{'page_number': 145,
  'page_char_count': 5047,
  'page_word_count': 850,
  'page_sentence_count_raw': 43,
  'page_token_count': 1261.75,
  'text': 'Portugal 19.7 Spain 19.8 Italy 22.2 Greece 25.2 Ireland 13.2 The list is from Friedrich Schneider’s “The Influence of the Economic Crisis on the Underground Economy in Germany and other OECD-countries in 2010” (unpublished paper, revised edition, January 2010). The list is not surprising. American, Swiss, and Japanese taxpayers are pretty honest. So are most of the other Western European democracies. Greece, Spain, and Italy are not. In fact, the level of tax evasion in Greece is such that the country’s deficit—which is so large that Greece has teetered on the brink of outright bankruptcy for years—would all but disappear if Greek citizens obeyed the law and paid what they owed. Why is America so much more law- abiding when it comes to taxes than Greece? Leites and Wolf would attribute that to the fact that the costs of tax evasion in the

In [16]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,210.0,210.0,210.0,210.0,210.0,210.0,210.0
mean,105.5,2228.32,387.0,21.99,557.08,25.62,3.05
std,60.77,1563.33,269.38,16.45,390.83,19.12,1.85
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,53.25,521.25,94.25,5.0,130.31,5.0,1.0
50%,105.5,2741.0,454.5,23.5,685.25,28.0,3.0
75%,157.75,3537.5,616.5,35.75,884.38,41.0,5.0
max,210.0,5888.0,955.0,65.0,1472.0,70.0,7.0


### Splitting each chunk into its own item

embed chunk of sentences into its own numerical representation. that'll give us a good granularity.

meaning, we can dive specifically into the text sample that was used in our model.

In [17]:
import re

#split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # join the sentences together into a paragraph like structure (join the list of sentences into one paragraph)
        joined_sentence_chunk = "".join(sentence_chunk).replace(" ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" (will work for any capital letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        #get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 #1 token = ~4 chars

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)
        

  0%|          | 0/210 [00:00<?, ?it/s]

641

In [18]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 119,
  'sentence_chunk': 'Parole was being granted too easily and too quickly. Chronic offenders were being treated no differently than people who were committing crimes for the first time. Douglas Walker, the man on the back of the motorcycle, had his first run-in with the law when he was thirteen years old for trafficking heroin. He had recently been given a temporary release so he could visit his pregnant wife, and he had never returned. Did that make sense?The group put together a proposal. At Reynolds’s insistence, it was short and simple, written in laymen’s language. It became known as the Three Strikes Law. Anyone convicted of a second serious or criminal offense in California, it stated, would have to serve double the sentence currently on the books. And anyone convicted of a third offense—and the definition of a third offense included every crime imaginable—would run out of chances entirely and serve a mandatory sentence of twenty-five years to life.*',
  'ch

In [19]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,641.0,641.0,641.0,641.0
mean,91.22,728.18,125.61,182.04
std,49.89,353.93,58.94,88.48
min,3.0,2.0,1.0,0.5
25%,52.0,498.0,87.0,124.5
50%,89.0,718.0,126.0,179.5
75%,127.0,954.0,163.0,238.5
max,210.0,2112.0,358.0,528.0


In [20]:
528 - 384

144

In [21]:
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,3,Malcolm Gladwell david and goliath Underdogs...,86,15,21.5
1,4,Contents INTRODUCTION Goliath “Am I a dog that...,677,112,169.25
2,4,PART THREE: THE LIMITS OF POWER SEVEN Rosemary...,205,36,51.25
3,5,André Trocmé “We feel obliged to tell you that...,128,21,32.0
4,6,BY THE SAME AUTHOR What the Dog Saw Outliers B...,68,13,17.0


### filter chunks of text for short chunks
these chunks may not contain much useful information

In [22]:
# show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 8.0 | Text: or even this: What if it’s this?
Chunk token count: 10.25 | Text: I wouldn’t be where I am today without my
Chunk token count: 25.75 | Text: Alcohol is not inherently good or bad or neutral. It starts out good, becomes neutral, and ends up bad.
Chunk token count: 18.25 | Text: And that’s the real issue—what can be done to enliven, enrich, and engage
Chunk token count: 9.0 | Text: * “Stephen Randolph” is a pseudonym.


In [23]:
# filter our dataframe for rows with under 30 tokens
pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_length[:2]

[{'page_number': 4,
  'sentence_chunk': 'Contents INTRODUCTION Goliath “Am I a dog that you should come to me with sticks?”PART ONE: THE ADVANTAGES OF DISADVANTAGES (AND THE DISADVANTAGES OF ADVANTAGES) ONE Vivek Ranadivé “It was really random. I mean, my father had never played basketball before.”TWO Teresa DeBrito “My largest class was twenty-nine kids. Oh, it was fun.”THREE Caroline Sacks “If I’d gone to the University of Maryland, I’d still be in science.”PART TWO: THE THEORY OF DESIRABLE DIFFICULTY FOUR David Boies You wouldn’t wish dyslexia on your child. Or would you?FIVE Emil “Jay” Freireich “How Jay did it, I don’t know.”SIX Wyatt Walker “De rabbit is de slickest o’ all de animals de Lawd ever made.”',
  'chunk_char_count': 677,
  'chunk_word_count': 112,
  'chunk_token_count': 169.25},
 {'page_number': 4,
  'sentence_chunk': 'PART THREE: THE LIMITS OF POWER SEVEN Rosemary Lawlor “I wasn’t born that way. This was forced upon me.”EIGHT Wilma Derksen “We have all done something 

In [24]:
random.sample(pages_and_chunks_over_min_token_length, k=1)

[{'page_number': 114,
  'sentence_chunk': 'The most important man in the community was the local priest. He came running. He went up to the soldiers. The raid must be done quickly, he warned them, or there would be trouble. Forty-five minutes passed, and the soldiers emerged with their haul: fifteen pistols, a rifle, a Schmeisser submachine gun, and a cache of explosives and ammunition. The patrol packed up and left, turning onto a side street that would take them out of the Lower Falls. In the interim, however, a small crowd had gathered, and as the armored cars turned the corner, a number of young men ran forward and started throwing stones at the soldiers. The patrol stopped. The crowd grew angry. The soldiers responded with tear gas.',
  'chunk_char_count': 704,
  'chunk_word_count': 123,
  'chunk_token_count': 176.0}]

### Embedding the text chunks