In [43]:
from trim_pdf import trim_pdf_before_page_one
from pdf_processor import find_page_number


In [44]:
input_path = 'inputPDF.pdf'

In [45]:
output_path = 'trimmed_' + input_path

In [46]:
page_idx = find_page_number(input_path)
trim_pdf_before_page_one(input_path, output_path, page_one_index=page_idx)


index is 5 and the page number is 2


In [5]:
from langchain.document_loaders import PyPDFLoader

In [6]:
# Load the PDF where each page is a document
loader = PyPDFLoader("trimmed_inputPDF.pdf")


In [7]:
pages = loader.load()

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = []
metadatas = []
for page in pages:
    page_chunks = text_splitter.split_documents([page])
    for chunk in page_chunks:
        chunks.append(chunk)
        # Propagate page number metadata from the parent page
        metadatas.append({"page": page.metadata["page"]})

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings


In [11]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [14]:
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document


In [15]:
documents = []
for i, chunk in enumerate(chunks, start=1):
    # Assume chunk.page_content is the text content
    documents.append(Document(page_content=chunk.page_content, metadata={"page": i}))


In [19]:
vector_store = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    collection_name="book_by_page",
    persist_directory="embedding"
)

In [18]:
vector_store.persist()


  vector_store.persist()


In [24]:
query = "explain chapter 3"
# Use a filter to get documents with page 51
filtered_chunks = vector_store.similarity_search(
    query,
    k=5,
    filter={"page": 51}
)

In [23]:
filtered_chunks

[Document(metadata={'page': 51}, page_content='together, we get a logical contradiction, right? And we can’t allow that. This means that we have to reject one or both statements.  Anu: Exactly. You reject statement 2, and I reject statement 1. So we arrive at different conclusions. We are both rational, Neel, but only one of us supports the true statement.  Neel: And who’s that?  Anu: Me, of course! (laughs) THINK & DO #3  What reasons can you think of to justify each of these positions?   1. White crows don’t exist.   2. The white bird Anu has been seeing every morning is a crow. The next morning, Anu saw the white bird again and hurried to wake Neel up. Neel dragged himself sleepily to the balcony and stared at the bird Anu was pointing at. Neel: I do see a white bird that looks just like a crow. But, something tells me it’s not a crow.  Anu: You need to do better than that, Neel. Give me some reason for that judgment.  Neel: You mean that I have to give you a rational justification 

In [28]:
from PyPDF2 import PdfReader
import re

In [29]:
reader = PdfReader('trimmed_inputPDF.pdf')
reader.outline

[]

In [30]:
chapters = []
for page_num in range(len(reader.pages)):
    page = reader.pages[page_num]
    text = page.extract_text()
    # Adjust regex pattern to match your chapter headings
    match = re.search(r'^(Chapter \d+:|CHAPTER \d+)', text, re.MULTILINE)
    if match:
        chapters.append({
            "title": match.group(0).strip(),
            "start_page": page_num + 1  # 1-based
        })

In [34]:
for page_num, page in enumerate(reader.pages):
        text = page.extract_text().lower()
        if "index" in text or "contents" in text:
            index_page = page_num + 1  # 1-based numbering
            index_text = page.extract_text()
            break

In [35]:
index_page

3

In [38]:
index_text

'     \n 3 Foundations  of Knowledge and Inquiry Across Disciplines  TABLE OF CONTENTS          ACKNOWLEDGEMENTS …………………………………………… 4  TO THE LEARNER ……………………………………………………… 5  Part 1: INTRODUCTION  ……………………………………… 7  Chapter 1 What this Book is About  …………………………… 9  Chapter 2 A Broad Overview ……………………………………… 23  Chapter 3 Ways of Knowing ……………………………………… 35  Part 1: Looking Back ………………………………………………… 47  Part 2: LOGIC AND REASONING IN INQUIRY  ……………… 53  Chapter 4 Introduction to Reasoning …………………………… 55  Chapter 5 Judging the Truth of Assertions ………………………  69  Chapter 6 Language, Logic, and Truth in Academic Inquiry … 79  Part 2: Looking Back ………………………………………………… 89  Part 3: MORE TOOLS OF INQUIRY …………………………… 91  Chapter 7  Classifying ………………………………………………… 93  Chapter 8 Generalising …………………………………………… 113  Chapter 9  Defining ………………………………………………… 129  Part 3: Looking Back ………………………………………………… 141  Part 4: INTEGRATING THE PIECES …………………………… 143  Chapter 10 Justifying ………………………………………………… 145  Chap

In [39]:
lines = index_text.split('\n')


In [40]:
lines

['     ',
 ' 3 Foundations  of Knowledge and Inquiry Across Disciplines  TABLE OF CONTENTS          ACKNOWLEDGEMENTS …………………………………………… 4  TO THE LEARNER ……………………………………………………… 5  Part 1: INTRODUCTION  ……………………………………… 7  Chapter 1 What this Book is About  …………………………… 9  Chapter 2 A Broad Overview ……………………………………… 23  Chapter 3 Ways of Knowing ……………………………………… 35  Part 1: Looking Back ………………………………………………… 47  Part 2: LOGIC AND REASONING IN INQUIRY  ……………… 53  Chapter 4 Introduction to Reasoning …………………………… 55  Chapter 5 Judging the Truth of Assertions ………………………  69  Chapter 6 Language, Logic, and Truth in Academic Inquiry … 79  Part 2: Looking Back ………………………………………………… 89  Part 3: MORE TOOLS OF INQUIRY …………………………… 91  Chapter 7  Classifying ………………………………………………… 93  Chapter 8 Generalising …………………………………………… 113  Chapter 9  Defining ………………………………………………… 129  Part 3: Looking Back ………………………………………………… 141  Part 4: INTEGRATING THE PIECES …………………………… 143  Chapter 10 Justifying ………………………………………………… 145  

In [41]:
chapter_pattern = r'Chapter (\d+).*?(\d+)$'
    
chapters = []

for line in lines:
    line = line.strip()
    match = re.search(chapter_pattern, line)
    if match:
        chapter_num = int(match.group(1))
        page_num = int(match.group(2))
        chapters.append((chapter_num, page_num))

In [42]:
chapters

[(1, 203)]

In [43]:
sections = []
for line in lines:
    # Skip empty lines
    line = line.strip()
    if not line:
        continue
        
    # Look for the last sequence of digits in the line
    parts = line.split('…')
    if len(parts) < 2:
        continue
        
    # Get the section name and page number
    section_name = parts[0].strip()
    # Extract the last number from the right part
    page_number = ''.join(filter(str.isdigit, parts[-1].strip()))
    
    if page_number and section_name:
        sections.append((section_name, int(page_number)))

In [44]:
sections

[('3 Foundations  of Knowledge and Inquiry Across Disciplines  TABLE OF CONTENTS          ACKNOWLEDGEMENTS',
  203)]

In [45]:
lines = [line.strip() for line in text.split('\n') if line.strip()]


In [46]:
lines

['3 foundations  of knowledge and inquiry across disciplines  table of contents          acknowledgements …………………………………………… 4  to the learner ……………………………………………………… 5  part 1: introduction  ……………………………………… 7  chapter 1 what this book is about  …………………………… 9  chapter 2 a broad overview ……………………………………… 23  chapter 3 ways of knowing ……………………………………… 35  part 1: looking back ………………………………………………… 47  part 2: logic and reasoning in inquiry  ……………… 53  chapter 4 introduction to reasoning …………………………… 55  chapter 5 judging the truth of assertions ………………………  69  chapter 6 language, logic, and truth in academic inquiry … 79  part 2: looking back ………………………………………………… 89  part 3: more tools of inquiry …………………………… 91  chapter 7  classifying ………………………………………………… 93  chapter 8 generalising …………………………………………… 113  chapter 9  defining ………………………………………………… 129  part 3: looking back ………………………………………………… 141  part 4: integrating the pieces …………………………… 143  chapter 10 justifying ………………………………………………… 145  chapter 11 

In [47]:
full_text = ' '.join(lines)


In [48]:
full_text

'3 foundations  of knowledge and inquiry across disciplines  table of contents          acknowledgements …………………………………………… 4  to the learner ……………………………………………………… 5  part 1: introduction  ……………………………………… 7  chapter 1 what this book is about  …………………………… 9  chapter 2 a broad overview ……………………………………… 23  chapter 3 ways of knowing ……………………………………… 35  part 1: looking back ………………………………………………… 47  part 2: logic and reasoning in inquiry  ……………… 53  chapter 4 introduction to reasoning …………………………… 55  chapter 5 judging the truth of assertions ………………………  69  chapter 6 language, logic, and truth in academic inquiry … 79  part 2: looking back ………………………………………………… 89  part 3: more tools of inquiry …………………………… 91  chapter 7  classifying ………………………………………………… 93  chapter 8 generalising …………………………………………… 113  chapter 9  defining ………………………………………………… 129  part 3: looking back ………………………………………………… 141  part 4: integrating the pieces …………………………… 143  chapter 10 justifying ………………………………………………… 145  chapter 11 c

In [49]:
matches = re.findall(r'([^0-9…]+?)(?:…|\s)+(\d+)(?=\s|$)', full_text)


In [50]:
matches

[(' foundations  of knowledge and inquiry across disciplines  table of contents          acknowledgements',
  '4'),
 ('  to the learner', '5'),
 (': introduction', '7'),
 ('  chapter', '1'),
 (' what this book is about', '9'),
 ('  chapter', '2'),
 (' a broad overview', '23'),
 ('  chapter', '3'),
 (' ways of knowing', '35'),
 (': looking back', '47'),
 (': logic and reasoning in inquiry', '53'),
 ('  chapter', '4'),
 (' introduction to reasoning', '55'),
 ('  chapter', '5'),
 (' judging the truth of assertions', '69'),
 ('  chapter', '6'),
 (' language, logic, and truth in academic inquiry', '79'),
 (': looking back', '89'),
 (': more tools of inquiry', '91'),
 ('  chapter', '7'),
 ('  classifying', '93'),
 ('  chapter', '8'),
 (' generalising', '113'),
 ('  chapter', '9'),
 ('  defining', '129'),
 (': looking back', '141'),
 (': integrating the pieces', '143'),
 ('  chapter', '10'),
 (' justifying', '145'),
 ('  chapter', '11'),
 (' critical reading and critical thinking', '169'),
 (

In [51]:
sections = []
for section_name, page_number in matches:
    sections.append((section_name.strip(), int(page_number)))
    

In [52]:
sections

[('foundations  of knowledge and inquiry across disciplines  table of contents          acknowledgements',
  4),
 ('to the learner', 5),
 (': introduction', 7),
 ('chapter', 1),
 ('what this book is about', 9),
 ('chapter', 2),
 ('a broad overview', 23),
 ('chapter', 3),
 ('ways of knowing', 35),
 (': looking back', 47),
 (': logic and reasoning in inquiry', 53),
 ('chapter', 4),
 ('introduction to reasoning', 55),
 ('chapter', 5),
 ('judging the truth of assertions', 69),
 ('chapter', 6),
 ('language, logic, and truth in academic inquiry', 79),
 (': looking back', 89),
 (': more tools of inquiry', 91),
 ('chapter', 7),
 ('classifying', 93),
 ('chapter', 8),
 ('generalising', 113),
 ('chapter', 9),
 ('defining', 129),
 (': looking back', 141),
 (': integrating the pieces', 143),
 ('chapter', 10),
 ('justifying', 145),
 ('chapter', 11),
 ('critical reading and critical thinking', 169),
 ('chapter', 12),
 ('consolidation', 189),
 ('glossary', 203)]

In [57]:
pattern = r'Chapter (\d+)\s+([^…]+?)(?:…+|\s+)(\d+)'
matches = re.findall(pattern, full_text)


In [58]:
matches

[]

In [59]:
lines = [line.strip() for line in re.split(r'\s{2,}', full_text) if line.strip()]


In [60]:
lines

['3 foundations',
 'of knowledge and inquiry across disciplines',
 'table of contents',
 'acknowledgements …………………………………………… 4',
 'to the learner ……………………………………………………… 5',
 'part 1: introduction',
 '……………………………………… 7',
 'chapter 1 what this book is about',
 '…………………………… 9',
 'chapter 2 a broad overview ……………………………………… 23',
 'chapter 3 ways of knowing ……………………………………… 35',
 'part 1: looking back ………………………………………………… 47',
 'part 2: logic and reasoning in inquiry',
 '……………… 53',
 'chapter 4 introduction to reasoning …………………………… 55',
 'chapter 5 judging the truth of assertions ………………………',
 '69',
 'chapter 6 language, logic, and truth in academic inquiry … 79',
 'part 2: looking back ………………………………………………… 89',
 'part 3: more tools of inquiry …………………………… 91',
 'chapter 7',
 'classifying ………………………………………………… 93',
 'chapter 8 generalising …………………………………………… 113',
 'chapter 9',
 'defining ………………………………………………… 129',
 'part 3: looking back ………………………………………………… 141',
 'part 4: integrating the pieces ……………

In [61]:
cleaned_text = re.sub(r'[^a-zA-Z0-9 ]', '', full_text)

In [62]:
cleaned_text

'3 foundations  of knowledge and inquiry across disciplines  table of contents          acknowledgements  4  to the learner  5  part 1 introduction   7  chapter 1 what this book is about   9  chapter 2 a broad overview  23  chapter 3 ways of knowing  35  part 1 looking back  47  part 2 logic and reasoning in inquiry   53  chapter 4 introduction to reasoning  55  chapter 5 judging the truth of assertions   69  chapter 6 language logic and truth in academic inquiry  79  part 2 looking back  89  part 3 more tools of inquiry  91  chapter 7  classifying  93  chapter 8 generalising  113  chapter 9  defining  129  part 3 looking back  141  part 4 integrating the pieces  143  chapter 10 justifying  145  chapter 11 critical reading and critical thinking   169   chapter 12 consolidation  189  glossary  203'

In [67]:
pattern = r'chapter (\d+)\s+([a-zA-Z\s]+)\s+(\d+)'
matches = re.findall(pattern, cleaned_text)

In [68]:
matches

[('1', 'what this book is about  ', '9'),
 ('2', 'a broad overview ', '23'),
 ('3', 'ways of knowing ', '35'),
 ('4', 'introduction to reasoning ', '55'),
 ('5', 'judging the truth of assertions  ', '69'),
 ('6', 'language logic and truth in academic inquiry ', '79'),
 ('7', 'classifying ', '93'),
 ('8', 'generalising ', '113'),
 ('9', 'defining ', '129'),
 ('10', 'justifying ', '145'),
 ('11', 'critical reading and critical thinking  ', '169'),
 ('12', 'consolidation ', '189')]