In [3]:
import fitz # requires PyMuPDF
from tqdm.auto import tqdm

In [4]:
pdf_path = "../data/ben-hogan.pdf"

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:10]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 1,
  'page_char_count': 125,
  'page_word_count': 20,
  'page_sentence_count_raw': 1,
  'page_token_count': 31.25,
  'text': 'Ben Hogan’s Five Lessons The Modern Fundamentals of Golf Ben Hogan, with Herbert Warren Wind and Drawings by Anthony Ravielli'},
 {'page_number': 2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 3,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 4,
  'page_char_count': 59,
  'page_word_count': 8,
  'page_sentence_count_raw': 1,
  'page_token_count': 14.75,
  'text': 'Digital Edition Published 2018 House of Majied Publications'},
 {'page_number': 5,
  'page_char_count': 154,
  'page_word_count': 29,
  'page_sentence_count_raw'

In [22]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 71,
  'page_char_count': 1857,
  'page_word_count': 353,
  'page_sentence_count_raw': 17,
  'page_token_count': 464.25,
  'text': 'The training exercise is a half-swing back and forth. Back and forth, back and forth, the body swings the arms like a pendulum of a clock. The elbows remain tightly glued to the sides Last but not least, practice the complete backswing. Try to visualize your proper plane and to keep your arms traveling on that plane as you swing the club back. Quite a few of my friends have told me that once they got the idea of the plane into their heads, it worked wonders for them. Like nothing else, it got them out of their old bad habits and made the correct movements come so naturally they could hardly believe it. I can believe it. I really never felt that my own backswing was satisfactorily grooved, or could be satisfactorily grooved, until I began to base my backswing on this concept of the plane. Up to that time — this was in 1938 — I had been strug

In [23]:
len(pages_and_texts)

115

In [24]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,0,1,1,0.0,
1,1,125,20,1,31.25,Ben Hogan’s Five Lessons The Modern Fundamenta...
2,2,0,1,1,0.0,
3,3,0,1,1,0.0,
4,4,59,8,1,14.75,Digital Edition Published 2018 House of Majied...


In [25]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,115.0,115.0,115.0,115.0,115.0
mean,57.0,1204.42,221.37,10.08,301.1
std,33.34,904.9,167.21,7.61,226.22
min,0.0,0.0,1.0,1.0,0.0
25%,28.5,282.0,52.0,3.0,70.5
50%,57.0,1223.0,225.0,10.0,305.75
75%,85.5,2053.5,370.0,16.0,513.38
max,114.0,2710.0,504.0,27.0,677.5


In [26]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

In [27]:
nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/115 [00:00<?, ?it/s]

In [28]:
# Inspect an example
random.sample(pages_and_texts, k=1)

[{'page_number': 0,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': '',
  'sentences': [],
  'page_sentence_count_spacy': 0}]

## Proposition based Chunking

In [29]:
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub

In [30]:
from dotenv import load_dotenv
load_dotenv()

True

In [31]:
obj = hub.pull("wfh/proposal-indexing")
llm = ChatOpenAI(model='gpt-3.5-turbo')
runnable = obj | llm

class Sentences(BaseModel):
    sentences: List[str]
    
# Extraction
extraction_chain = create_extraction_chain_pydantic(pydantic_schema=Sentences, llm=llm)
def get_propositions(text):
    runnable_output = runnable.invoke({
    	"input": text
    }).content
    propositions = extraction_chain.invoke(runnable_output)["text"][0].sentences
    return propositions

In [32]:
pages_and_texts[32]

{'page_number': 32,
 'page_char_count': 443,
 'page_word_count': 82,
 'page_sentence_count_raw': 2,
 'page_token_count': 110.75,
 'text': 'The feet should be set apart the width of the shoulders when the golfer prepares to play a standard five-iron shot Some tournament-caliber golfers, as you may have noticed, choose to stand with the toes of both feet pointed out. It has always struck me that these players succeed in spite of the placement of their feet, for I have been convinced since my early days in golf that THERE IS ONE CORRECT BASIC STANCE: THE RIGHT FOOT IS AT A RIGHT',
 'sentences': ['The feet should be set apart the width of the shoulders when the golfer prepares to play a standard five-iron shot Some tournament-caliber golfers, as you may have noticed, choose to stand with the toes of both feet pointed out.',
  'It has always struck me that these players succeed in spite of the placement of their feet, for I have been convinced since my early days in golf that THERE IS ONE C

In [33]:
pages_and_texts[10:29]

[{'page_number': 10,
  'page_char_count': 2068,
  'page_word_count': 374,
  'page_sentence_count_raw': 19,
  'page_token_count': 517.0,
  'text': '1   The Grip GOOD GOLF BEGINS WITH A GOOD GRIP. This statement, I realize, packs as much explosive punch as announcing the startling fact that the battery in baseball is composed of a pitcher and a catcher. Moreover, for most golfers the grip is the drabbest part of the swing. There’s no glamour to it. They see it accomplishing nothing active, nothing decisive. On the other hand, for myself and other serious golfers there is an undeniable beauty in the way a fine player sets his hands on the club. Walter Hagen, for instance, had a beautiful grip, delicate and at the same time powerful. It always looked to me as if Hagen’s hands had been especially designed to fit on a golf club. Of the younger players today, Jack Burke gets his hands on the club very handsomely. No doubt a professional golfer’s admiration for an impressive grip comes from hi

In [34]:
# text = "Text splitting in LangChain is a critical feature that facilitates the division of large texts into smaller, manageable segments. "
# paragraphs = text.split("\n\n")
text_propositions = []
for i, page in enumerate(pages_and_texts[10:]):
    propositions = get_propositions(page["text"])
    text_propositions.extend(propositions)
    print (f"Done with {i}")

print (f"You have {len(text_propositions)} propositions")
print(text_propositions[:10])

Done with 0
Done with 1
Done with 2
Done with 3
Done with 4
Done with 5
Done with 6
Done with 7
Done with 8
Done with 9
Done with 10
Done with 11
Done with 12
Done with 13
Done with 14
Done with 15
Done with 16
Done with 17
Done with 18
Done with 19
Done with 20
Done with 21
Done with 22
Done with 23
Done with 24
Done with 25
Done with 26
Done with 27
Done with 28
Done with 29
Done with 30
Done with 31
Done with 32
Done with 33
Done with 34
Done with 35
Done with 36
Done with 37
Done with 38
Done with 39
Done with 40
Done with 41
Done with 42
Done with 43
Done with 44
Done with 45
Done with 46
Done with 47
Done with 48
Done with 49
Done with 50
Done with 51
Done with 52
Done with 53
Done with 54
Done with 55
Done with 56
Done with 57
Done with 58
Done with 59
Done with 60
Done with 61
Done with 62
Done with 63
Done with 64
Done with 65
Done with 66
Done with 67
Done with 68
Done with 69
Done with 70
Done with 71
Done with 72
Done with 73
Done with 74
Done with 75
Done with 76
Done with

In [50]:
text_propositions

["The statement 'Good golf begins with a good grip' emphasizes the importance of the grip in golf.",
 'The battery in baseball is composed of a pitcher and a catcher.',
 'The grip is often seen as the drabbest part of the swing by most golfers.',
 'Most golfers perceive the grip as accomplishing nothing active or decisive.',
 'Walter Hagen had a beautiful grip on the golf club, delicate yet powerful.',
 'Jack Burke also has a very impressive way of placing his hands on the club.',
 "A professional golfer's admiration for an impressive grip stems from knowing that the grip is essential for the action of the golf swing.",
 "The grip is the player's only contact point with the ball through the clubhead.",
 'The power in the golf swing is generated by the body and transferred through the arms to the hands and then to the clubhead.',
 'The power multiplies itself with each transfer, similar to a chain action in physics.',
 'A proper grip is crucial for the chain action in the golf swing to 

In [51]:
import numpy as np

np_array = np.array(text_propositions)

np.save('text_propositions.npy', text_propositions)


In [100]:
with open('text_propositions.txt', 'w') as file:
    for item in text_propositions:
        file.write(f"{item}\n")

import pickle
with open('text_propositions.pkl', 'wb') as file:
    pickle.dump(text_propositions, file)

In [54]:
from semantic_router.encoders import OpenAIEncoder
encoder = OpenAIEncoder(name='text-embedding-3-small')

In [55]:
from semantic_chunkers import StatisticalChunker
chunker = StatisticalChunker(encoder=encoder)

In [57]:
text_propositions[:5]

["The statement 'Good golf begins with a good grip' emphasizes the importance of the grip in golf.",
 'The battery in baseball is composed of a pitcher and a catcher.',
 'The grip is often seen as the drabbest part of the swing by most golfers.',
 'Most golfers perceive the grip as accomplishing nothing active or decisive.',
 'Walter Hagen had a beautiful grip on the golf club, delicate yet powerful.']

In [76]:
content = " ".join(text_propositions)

In [77]:
content

"The statement 'Good golf begins with a good grip' emphasizes the importance of the grip in golf. The battery in baseball is composed of a pitcher and a catcher. The grip is often seen as the drabbest part of the swing by most golfers. Most golfers perceive the grip as accomplishing nothing active or decisive. Walter Hagen had a beautiful grip on the golf club, delicate yet powerful. Jack Burke also has a very impressive way of placing his hands on the club. A professional golfer's admiration for an impressive grip stems from knowing that the grip is essential for the action of the golf swing. The grip is the player's only contact point with the ball through the clubhead. The power in the golf swing is generated by the body and transferred through the arms to the hands and then to the clubhead. The power multiplies itself with each transfer, similar to a chain action in physics. A proper grip is crucial for the chain action in the golf swing to work effectively. A golfer with a defecti

In [78]:
chunks = chunker(docs=[content])

[32m2024-06-30 16:14:02 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.[0m


In [73]:
from pprint import pprint

In [88]:
chunks[0][0].splits

["The statement 'Good golf begins with a good grip' emphasizes the importance of the grip in golf.",
 'The battery in baseball is composed of a pitcher and a catcher.',
 'The grip is often seen as the drabbest part of the swing by most golfers.',
 'Most golfers perceive the grip as accomplishing nothing active or decisive.',
 'Walter Hagen had a beautiful grip on the golf club, delicate yet powerful.',
 'Jack Burke also has a very impressive way of placing his hands on the club.',
 "A professional golfer's admiration for an impressive grip stems from knowing that the grip is essential for the action of the golf swing.",
 "The grip is the player's only contact point with the ball through the clubhead.",
 'The power in the golf swing is generated by the body and transferred through the arms to the hands and then to the clubhead.']

In [107]:
chunk_list = []
for chunk in chunks[0]:
    pprint(chunk.splits)
    chunk_list.append(chunk.splits)
    print("----------------")

AttributeError: 'str' object has no attribute 'splits'

In [106]:
chunk_list

[array(["The statement 'Good golf begins with a good grip' emphasizes the importance of the grip in golf.",
        'The battery in baseball is composed of a pitcher and a catcher.',
        'The grip is often seen as the drabbest part of the swing by most golfers.',
        'Most golfers perceive the grip as accomplishing nothing active or decisive.',
        'Walter Hagen had a beautiful grip on the golf club, delicate yet powerful.',
        'Jack Burke also has a very impressive way of placing his hands on the club.',
        "A professional golfer's admiration for an impressive grip stems from knowing that the grip is essential for the action of the golf swing.",
        "The grip is the player's only contact point with the ball through the clubhead.",
        'The power in the golf swing is generated by the body and transferred through the arms to the hands and then to the clubhead.'],
       dtype='<U137'),
 array(['The power multiplies itself with each transfer, similar to a ch

In [99]:
with open('chunks.txt', 'w') as file:
    for item in chunk_list:
        file.write(f"{item}\n")

import pickle
with open('chunks.pkl', 'wb') as file:
    pickle.dump(chunk_list, file)


In [6]:
import pickle
with open('chunks.pkl', 'rb') as file:
    chunks = pickle.load(file)

# Print the loaded list
# print(chunks)

In [7]:
chunks = [list(chunk) for chunk in chunks]

In [113]:
chunks[0]

["The statement 'Good golf begins with a good grip' emphasizes the importance of the grip in golf.",
 'The battery in baseball is composed of a pitcher and a catcher.',
 'The grip is often seen as the drabbest part of the swing by most golfers.',
 'Most golfers perceive the grip as accomplishing nothing active or decisive.',
 'Walter Hagen had a beautiful grip on the golf club, delicate yet powerful.',
 'Jack Burke also has a very impressive way of placing his hands on the club.',
 "A professional golfer's admiration for an impressive grip stems from knowing that the grip is essential for the action of the golf swing.",
 "The grip is the player's only contact point with the ball through the clubhead.",
 'The power in the golf swing is generated by the body and transferred through the arms to the hands and then to the clubhead.']

In [8]:
chunk_joined = []
for chunk in chunks:
    chunk_joined.append(" ".join(chunk))

In [13]:
from langchain_community.embeddings import OllamaEmbeddings

In [14]:
from rich import print
from langchain.docstore.document import Document
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

local_llm = ChatOllama(model="mistral")

# RAG
def rag(chunks, collection_name):
    vectorstore = Chroma.from_documents(
        documents=documents,
        collection_name=collection_name,
        embedding=OllamaEmbeddings(model='nomic-embed-text'),
    )
    retriever = vectorstore.as_retriever()

    prompt_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(prompt_template)

    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | local_llm
        | StrOutputParser()
    )
    result = chain.invoke("How should the wrist be positioned during impact?")
    print(result)

In [16]:
documents = [Document(page_content=chunk, metadata={"source": "local"}) for chunk in chunk_joined]


In [19]:
rag(documents, "agentic-chunks")

In [None]:
vectorstore = Chroma.from_documents(
    documents=documents,
    collection_name="agentic-chunks",
    embedding=OllamaEmbeddings(model='nomic-embed-text'),
)
retriever = vectorstore.as_retriever()


In [26]:
prompt_template = """Answer the question based only on the following context:
{context}
Question: {question}
"""

prompt_template = """
### Ben Hogan Chatbot Instruction

**Role:** You are Ben Hogan, the legendary golfer, known for your precise ball-striking, meticulous approach to the game, and deep understanding of golf fundamentals.

**Tone:** Calm, authoritative, encouraging, and reflective of the 1950s era.

**Objective:** Answer user questions about golf, specifically focusing on golf swing techniques, mindset, philosophy, and love for the game, based on the book "Five Lessons: The Fundamentals of Golf."

#### Specific Instructions

**1. Provide Expert Golf Swing Advice:**
   - Break down the golf swing into its fundamental components: grip, stance and posture, the first part of the swing, and the second part of the swing.
   - Use detailed explanations from the book "Five Lessons: The Fundamentals of Golf."
   - Example:
     - User: "How should I hold the club to improve my grip?"
     - Ben Hogan: "The grip is the foundation of your swing. Hold the club in your fingers, not your palm. The V formed by your thumb and index finger should point towards your right shoulder. A proper grip ensures control and consistency."

**2. Share Mindset and Philosophy:**
   - Discuss the mental approach to golf, emphasizing focus, discipline, and the importance of practice.
   - Share personal anecdotes and insights from your career.
   - Example:
     - User: "What mindset should I have when approaching a difficult shot?"
     - Ben Hogan: "Golf is as much a mental game as it is a physical one. Approach each shot with confidence and focus. Visualize the perfect shot, trust your swing, and stay calm. Every challenge is an opportunity to improve."

**3. Encourage and Motivate:**
   - Provide motivational support and encouragement.
   - Reinforce the idea that improvement comes with practice and dedication.
   - Highlight the joy and fulfillment of playing golf.
   - Example:
     - User: "How important is practice in becoming a good golfer?"
     - Ben Hogan: "Practice is the bedrock of success in golf. Consistent, deliberate practice hones your skills and builds muscle memory. Dedication to practice will pay off on the course."

**4. Reflect on Philosophy and Love for Golf:**
   - Discuss your philosophy on golf and life.
   - Share why you love golf and what it means to you.
   - Example:
     - User: "What philosophy did you follow throughout your golf career?"
     - Ben Hogan: "My philosophy was simple: strive for perfection in every aspect of the game. Understand the fundamentals, work tirelessly to improve, and never settle for mediocrity. Golf is a journey of continuous learning and growth."

**Guidelines:**
- Always base responses on the teachings from "Five Lessons: The Fundamentals of Golf."
- Be informative, supportive, and engaging.
- Use quotes and references from your book to lend authenticity and depth to your answers.

Answer the question based only on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(prompt_template)

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | local_llm
    | StrOutputParser()
)

In [25]:
result = chain.invoke("How should the left hand be positioned during impact?")
print(result)

In [None]:
result = chain.invoke("I have a problem transferring from right to left")
print(result)

In [27]:
result = chain.invoke("What are some tips on gaining more distance?")
print(result)

In [104]:
import importlib
import agentic_chunker

importlib.reload(agentic_chunker)
from agentic_chunker import AgenticChunker


ac = AgenticChunker()
ac.print_logging = False
ac.add_propositions(text_propositions[:5])
print(ac.pretty_print_chunks())
chunks = ac.get_chunks(get_type='list_of_strings')
print(chunks)

In [105]:
chunks

["The statement 'Good golf begins with a good grip' emphasizes the importance of the grip in golf.",
 'The battery in baseball is composed of a pitcher and a catcher.',
 'The grip is often seen as the drabbest part of the swing by most golfers. Most golfers perceive the grip as accomplishing nothing active or decisive.',
 'Walter Hagen had a beautiful grip on the golf club, delicate yet powerful.']

In [23]:
len(chunks)

123