In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document

In [2]:
# already done this
index_data = [('1', 'what this book is about  ', '9'),
 ('2', 'a broad overview ', '23'),
 ('3', 'ways of knowing ', '35'),
 ('4', 'introduction to reasoning ', '55'),
 ('5', 'judging the truth of assertions  ', '69'),
 ('6', 'language logic and truth in academic inquiry ', '79'),
 ('7', 'classifying ', '93'),
 ('8', 'generalising ', '113'),
 ('9', 'defining ', '129'),
 ('10', 'justifying ', '145'),
 ('11', 'critical reading and critical thinking  ', '169'),
 ('12', 'consolidation ', '189')]

In [12]:
for chp in index_data:
    chapter_number, chapter_name, starting_page_number = chp
    print(chapter_name)

what this book is about  
a broad overview 
ways of knowing 
introduction to reasoning 
judging the truth of assertions  
language logic and truth in academic inquiry 
classifying 
generalising 
defining 
justifying 
critical reading and critical thinking  
consolidation 


In [4]:
# load pdf
loader = PyPDFLoader("trimmed_inputPDF.pdf")
pages = loader.load()


In [26]:
pages[25].metadata.get('page_label')

'26'

In [28]:
def findChapterData(page_label):
    chapter_name = ''
    chapter_number = 0
    for chp in index_data:
        chp_num, chp_name, starting_page = chp
        if (int(page_label) > int(starting_page, 10)):
            chapter_name = chp_name
            chapter_number = chp_num
        else:
            return chapter_name, chapter_number
    return '', ''
        

In [29]:
chapter_number, chapter_name = findChapterData(24)
print('\n \n')
print(chapter_number, chapter_name)


 

a broad overview  2


In [42]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = []
metadatas = []
documents = []
for page in pages:
    chapter_name, chapter_number = findChapterData(page.metadata.get('page_label'))
    # print(f'this is the chapter_name and chapter_number', chapter_name, chapter_number)
    if (chapter_number==''  and chapter_name==''):
        print(f'This page is empty', page.metadata.get('page_label'))
    page_chunks = text_splitter.split_documents([page])
    for chunk in page_chunks:
        chunks.append(chunk)
        # Propagate page number metadata from the parent page
        data = {"page": page.metadata.get("page_label")}
        if chapter_name: 
            data['chapter_name'] = chapter_name
        if chapter_number:
            data['chapter_number'] = int(chapter_number)
        metadatas.append(data)
        documents.append(Document(page_content=chunk.page_content, metadata=data))

This page is empty 190
This page is empty 191
This page is empty 192
This page is empty 193
This page is empty 194
This page is empty 195
This page is empty 196
This page is empty 197
This page is empty 198


In [43]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [45]:
documents

[Document(metadata={'page': '1'}, page_content='FOUNDATIONS OF KNOWLEDGE AND INQUIRY ACROSS DISCIPLINES   \nTara Mohanan and K P Mohanan    \n“If education were for living and not merely for livelihood; if education were for joy and happiness and not merely for temporal success; … if education were as much for service as it is for self-seeking; if education were as much for wisdom and truth as it is for so-called facts; … then, indeed, would the younger generation be well-equipped for life.” Dr. George Sydney Arundale, Dynamic Education. In C. Roberts (ed.) What India Thinks:  Being a Symposium of Thought Contributed by  Fifty Men and Women Having India’s Interests at Heart'),
 Document(metadata={'page': '2'}, page_content='2 \n     I was born not knowing and have had only a little time to change that here and there.  Richard P Feynman                    \n CREATIVE COMMONS: ATTRIBUTION - NON COMMERCIAL - SHARE ALIKE  [This license allows others to remix, adapt, and build upon this wor

In [44]:
vector_store = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    collection_name="book_by_page",
    persist_directory="embedding"
)