In [31]:
import fitz # requires PyMuPDF
from tqdm.auto import tqdm

In [32]:
# Location of the source PDF (relative path); passed to open_and_read_pdf below.
pdf_path = "../data/ben-hogan.pdf"

def text_formatter(text: str) -> str:
    """Apply light clean-up to raw page text.

    Every newline becomes a single space and leading/trailing whitespace is
    trimmed. Runs of spaces inside the text are left untouched.
    """
    # Newline handling tends to be document-specific, so treat this as a
    # starting point; further clean-up steps can be added here.
    flattened = " ".join(text.split("\n"))
    return flattened.strip()

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Only plain text is extracted. The document is closed before the function
    returns, including when extraction fails part-way through.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: One dictionary per page, in document order, with the keys:
            "page_number": zero-based index of the page within the PDF (not adjusted).
            "page_char_count": length of the formatted text.
            "page_word_count": number of pieces after splitting on single spaces.
            "page_sentence_count_raw": number of pieces after splitting on ". ".
            "page_token_count": rough token estimate (characters / 4).
            "text": the formatted page text.

        Note: splitting an empty string yields one empty piece, so a blank page
        reports 1 word and 1 raw sentence rather than 0.
    """
    pages_and_texts = []
    # Context manager releases the file handle even if a page fails to extract
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count explicitly
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())  # plain text of the page, lightly cleaned
            pages_and_texts.append({"page_number": page_number,  # zero-based index, no offset applied
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts

# Extract every page of the PDF, then preview the first ten page records.
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:10]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 1,
  'page_char_count': 125,
  'page_word_count': 20,
  'page_sentence_count_raw': 1,
  'page_token_count': 31.25,
  'text': 'Ben Hogan’s Five Lessons The Modern Fundamentals of Golf Ben Hogan, with Herbert Warren Wind and Drawings by Anthony Ravielli'},
 {'page_number': 2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 3,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 4,
  'page_char_count': 59,
  'page_word_count': 8,
  'page_sentence_count_raw': 1,
  'page_token_count': 14.75,
  'text': 'Digital Edition Published 2018 House of Majied Publications'},
 {'page_number': 5,
  'page_char_count': 154,
  'page_word_count': 29,
  'page_sentence_count_raw'

In [33]:
import random

# Spot-check three randomly chosen page records (no seed is set in this cell, so the pick varies between runs).
random.sample(pages_and_texts, k=3)

[{'page_number': 10,
  'page_char_count': 2068,
  'page_word_count': 374,
  'page_sentence_count_raw': 19,
  'page_token_count': 517.0,
  'text': '1   The Grip GOOD GOLF BEGINS WITH A GOOD GRIP. This statement, I realize, packs as much explosive punch as announcing the startling fact that the battery in baseball is composed of a pitcher and a catcher. Moreover, for most golfers the grip is the drabbest part of the swing. There’s no glamour to it. They see it accomplishing nothing active, nothing decisive. On the other hand, for myself and other serious golfers there is an undeniable beauty in the way a fine player sets his hands on the club. Walter Hagen, for instance, had a beautiful grip, delicate and at the same time powerful. It always looked to me as if Hagen’s hands had been especially designed to fit on a golf club. Of the younger players today, Jack Burke gets his hands on the club very handsomely. No doubt a professional golfer’s admiration for an impressive grip comes from hi

In [34]:
# Total number of page records extracted from the PDF.
len(pages_and_texts)

115

In [35]:
import pandas as pd

# Load the per-page records into a DataFrame (one row per page) and preview the first rows.
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,0,1,1,0.0,
1,1,125,20,1,31.25,Ben Hogan’s Five Lessons The Modern Fundamenta...
2,2,0,1,1,0.0,
3,3,0,1,1,0.0,
4,4,59,8,1,14.75,Digital Edition Published 2018 House of Majied...


In [36]:
# Summary statistics for the numeric per-page columns, rounded to two decimals.
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,115.0,115.0,115.0,115.0,115.0
mean,57.0,1204.42,221.37,10.08,301.1
std,33.34,904.9,167.21,7.61,226.22
min,0.0,0.0,1.0,1.0,0.0
25%,28.5,282.0,52.0,3.0,70.5
50%,57.0,1223.0,225.0,10.0,305.75
75%,85.5,2053.5,370.0,16.0,513.38
max,114.0,2710.0,504.0,27.0,677.5


In [37]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

In [38]:
# Bare English language object; the sentencizer is the only component added here.
nlp = English()

# Sentence boundaries come from the sentencizer pipeline, see https://spacy.io/api/sentencizer/
nlp.add_pipe("sentencizer")

for item in tqdm(pages_and_texts):
    # Split the page into sentences and convert each one to a plain string in a single pass
    sentence_strings = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Store the sentences together with their count on the page record
    item.update(
        sentences=sentence_strings,
        page_sentence_count_spacy=len(sentence_strings),
    )

  0%|          | 0/115 [00:00<?, ?it/s]

In [39]:
# Inspect one random page record to check the newly added "sentences" and "page_sentence_count_spacy" fields.
random.sample(pages_and_texts, k=1)

[{'page_number': 11,
  'page_char_count': 242,
  'page_word_count': 42,
  'page_sentence_count_raw': 3,
  'page_token_count': 60.5,
  'text': 'A golfer’s power is originated and generated by the movements of the body. This power is transferred from the player’s body to his arms and then to his hands. It multiplies itself enormously with every transfer, like a chain action in physics',
  'sentences': ['A golfer’s power is originated and generated by the movements of the body.',
   'This power is transferred from the player’s body to his arms and then to his hands.',
   'It multiplies itself enormously with every transfer, like a chain action in physics'],
  'page_sentence_count_spacy': 3}]

## Proposition-based Chunking

In [40]:
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub

In [45]:
from dotenv import load_dotenv
# Load environment variables from a .env file — presumably the API credentials needed by ChatOpenAI; TODO confirm.
load_dotenv()

True

In [46]:
from agentic_chunker import AgenticChunker
# NOTE(review): `ac` is not used in the cells visible here — presumably consumed by a later chunking step; confirm.
ac = AgenticChunker()

In [48]:
# Fetch the "wfh/proposal-indexing" object from LangChain Hub (presumably the proposition-extraction prompt — TODO confirm).
obj = hub.pull("wfh/proposal-indexing")
llm = ChatOpenAI(model='gpt-3.5-turbo')
# Pipe the hub object into the chat model; get_propositions invokes this with {"input": <text>}.
runnable = obj | llm

# Schema handed to the extraction chain: a single field holding a list of sentence strings.
class Sentences(BaseModel):
    sentences: List[str]
    
# Extraction: chain whose results are read back as `Sentences` objects in get_propositions.
# NOTE(review): this cell's output shows a `warn_deprecated(` warning, which suggests a deprecated
# LangChain API is in use here — confirm which call triggers it and plan a migration.
extraction_chain = create_extraction_chain_pydantic(pydantic_schema=Sentences, llm=llm)
def get_propositions(text: str) -> list[str]:
    """
    Breaks a passage of text into propositions using the LLM pipeline.

    The text is first sent through `runnable` (hub object piped into the chat
    model); the content of that response is then passed to `extraction_chain`,
    and the `sentences` of the first extracted object are returned.

    Parameters:
        text (str): The passage to decompose, e.g. the text of one page.

    Returns:
        list[str]: The extracted propositions. Empty when `text` is empty or
        whitespace-only, or when the extraction step yields no results.
    """
    # Blank input (e.g. an empty page) has nothing to extract; skip both LLM calls
    if not text or not text.strip():
        return []

    runnable_output = runnable.invoke({"input": text}).content
    extracted = extraction_chain.invoke(runnable_output)["text"]

    # The extraction can come back empty; indexing [0] would raise IndexError
    if not extracted:
        return []
    return extracted[0].sentences

  warn_deprecated(


In [57]:
# Look at a single page record (index 32), including the "text" field that is later passed to get_propositions.
pages_and_texts[32]

{'page_number': 32,
 'page_char_count': 443,
 'page_word_count': 82,
 'page_sentence_count_raw': 2,
 'page_token_count': 110.75,
 'text': 'The feet should be set apart the width of the shoulders when the golfer prepares to play a standard five-iron shot Some tournament-caliber golfers, as you may have noticed, choose to stand with the toes of both feet pointed out. It has always struck me that these players succeed in spite of the placement of their feet, for I have been convinced since my early days in golf that THERE IS ONE CORRECT BASIC STANCE: THE RIGHT FOOT IS AT A RIGHT',
 'sentences': ['The feet should be set apart the width of the shoulders when the golfer prepares to play a standard five-iron shot Some tournament-caliber golfers, as you may have noticed, choose to stand with the toes of both feet pointed out.',
  'It has always struck me that these players succeed in spite of the placement of their feet, for I have been convinced since my early days in golf that THERE IS ONE C

In [50]:
# Run proposition extraction over every page and pool the results into one flat list.
text_propositions = []
for page in tqdm(pages_and_texts, desc="Extracting propositions"):
    # Blank pages carry no content; skip them rather than spend LLM calls on empty input
    if not page["text"]:
        continue
    propositions = get_propositions(page["text"])
    text_propositions.extend(propositions)

print(f"You have {len(text_propositions)} propositions")
print(text_propositions[:10])