In [1]:
from langchain.retrievers import ParentDocumentRetriever



In [2]:
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
# from langchain_community.document_loaders import PyPDFLoader

# loader = PyPDFLoader("raw_data/History_of_India_2nd_ed.pdf")
# pages = loader.load_and_split()

In [21]:
from langchain_community.document_loaders import PyPDFLoader

# Function to load and filter out specific pages
def load_pdf_excluding_pages(pdf_path, pages_to_skip):
    """Load a PDF as one document per page, leaving out the given pages.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.
        pages_to_skip: Iterable of zero-based page indices to drop. ``None``
            or an empty iterable keeps every page.

    Returns:
        A list with the loaded page documents whose page index is not in
        ``pages_to_skip``, in their original order.
    """
    # A set gives constant-time membership tests and accepts any iterable
    # (list, range, generator, ...).
    skipped = set(pages_to_skip) if pages_to_skip is not None else set()

    # load() returns one document per PDF page. load_and_split() would run an
    # additional text splitter over the pages, so a long page could turn into
    # several documents and positional indices would no longer line up with
    # the page numbers the caller passed in.
    all_pages = PyPDFLoader(pdf_path).load()

    # Prefer the zero-based "page" entry the loader stores in the metadata;
    # fall back to the list position if a document carries no such entry.
    return [
        page
        for position, page in enumerate(all_pages)
        if page.metadata.get("page", position) not in skipped
    ]

# Location of the source PDF.
pdf_path = "raw_data/History_of_India_2nd_ed.pdf"
# Zero-based page indices to leave out: the first 14 pages (0-13) and 455-468.
# Presumably front matter (e.g. table of contents) and back matter -- confirm against the PDF.
pages_to_skip = [*range(14), *range(455, 469)]

In [22]:
# Read the PDF, dropping every page listed in pages_to_skip.
filtered_pages = load_pdf_excluding_pages(pdf_path, pages_to_skip)

# Sanity check: print the first five documents that survived the filter.
for page in filtered_pages[:5]:
    print(page)

page_content='xiv INTRODUCTION TO BURTON \nSTEIN ’ S  A HISTORY OF INDIA      \n  David Arnold \n Burton Stein ’ s  A History of India  is one of the most ambitious histories of the \nsubcontinent ever undertaken, certainly by an individual scholar and in a \nsingle - volume work. First published in 1998, it proﬁ  ted from the decades of \nintensive scholarly research that had been carried out by Western and South Asian scholars since the 1950s and to which Stein himself was an active and inﬂ uential contributor. Although in the  History  Stein seldom refers explicitly \nto speciﬁ  c historians, their views are at least implicit in his critical treatment of Indian feudalism, the nature of the Vijayanagara  ‘ empire ’ , the decline of \nthe Mughals and the personality and leadership of M. K. Gandhi. It is evident, too, even if not foregrounded, in the wealth of historiography and historical debate, which Stein himself found  ‘ marvellously stimulating ’ , that consistently \nunderpins a

In [15]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model later passed to the Chroma vector stores as their embedding_function.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Equivalent to SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [23]:
# Storage layer holding the parent documents.
store = InMemoryStore()
# Vector store in which the child chunks are indexed.
vectorstore = Chroma(embedding_function=embeddings, collection_name="full_documents")
# Splitter that produces the child documents from each parent.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# No parent_splitter is passed here, so only child splitting is configured.
retriever = ParentDocumentRetriever(
    docstore=store,
    vectorstore=vectorstore,
    child_splitter=child_splitter,
)

In [24]:
# Index the filtered pages; ids are left at their default (None).
retriever.add_documents(filtered_pages)

In [25]:
# Show every key currently held in the parent-document store.
list(store.yield_keys())

['2b10942b-f777-4057-8797-994525156fa8',
 'a756fc02-754b-4f0b-8cef-679c30ca3b9b',
 '9adaf858-003a-4b29-aaf5-123f2cf81fb9',
 '711da3dd-78d7-496c-8431-edf7241f0806',
 'dd747fab-4e0a-41c3-9f72-59966f23307d',
 'cc86c50b-2464-4e17-b33b-7ff362804e6c',
 '94310c19-64e0-4ea3-89f6-9a4eee9e59ee',
 'a86c60f8-5046-4df1-bc83-5d9689834152',
 '8e24d2a0-4b2b-4277-81c2-38005f9c8645',
 '5e5d6318-95e1-4d14-8a14-b0119012dda2',
 'c072a59c-1b65-470c-8cca-e15eb0e6a46b',
 'f62879f9-cfe8-4677-b5d5-0ebf7da62746',
 '4ae78e66-9dbd-4435-a85c-dae45dde3364',
 '48025de7-d5da-4df8-bc9c-0d18044cabaf',
 '0001a9f0-e197-4c5c-83b9-8a75087a8bb3',
 '61717e40-c303-428c-989d-cc3f4454453d',
 '4e959150-b8e0-40e5-a526-35cd42691e1d',
 'f35fa2c3-adbb-42df-8a5b-922d118f9513',
 'bc9d87f3-63a6-4aba-9ed5-0c2be963fd00',
 'bcd1b808-d8a4-4cc1-b9b0-ab172eba9bf4',
 '0a890d16-9ef9-446d-84eb-f68d91b138b7',
 'c8c7367a-3206-4772-b06c-739898f01009',
 '506f9589-1d21-4fca-a403-79581a3cb18c',
 '6167020e-0870-44e2-b96e-0d6d88d03595',
 '9ef6ead4-5dd9-

In [26]:
# Query the vector store directly (not through the retriever) to see the indexed child chunks.
sub_docs = vectorstore.similarity_search("rajasthan")

In [27]:
# Text of the first chunk returned by the similarity search.
print(sub_docs[0].page_content)

Contemporary South Asia
380incident involving the members of the Bishnoi sect of Rajasthan, which was


In [28]:
# Same query, this time through the ParentDocumentRetriever
# (expected to hand back parent documents rather than child chunks).
retrieved_docs = retriever.invoke("rajasthan")

In [29]:
# Length of the first retrieved document's text, for comparison with the child chunk size.
len(retrieved_docs[0].page_content)

3251

In [30]:
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=embeddings
)
# The storage layer for the parent documents
store = InMemoryStore()

In [31]:
# Rebuild the retriever, this time with a parent splitter in addition to the child splitter.
retriever = ParentDocumentRetriever(
    docstore=store,
    vectorstore=vectorstore,
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
)

In [32]:
# Feed the filtered pages through the parent/child splitting retriever.
retriever.add_documents(filtered_pages)

In [33]:
# Count the keys in the parent-document store without building a list of them.
sum(1 for _ in store.yield_keys())

834

In [34]:
# Query the vector store directly and print the text of the first chunk it returns.
sub_docs = vectorstore.similarity_search("rajasthan")
print(sub_docs[0].page_content)

Rajasthan in the northwest or Karnataka in the central peninsula  –  there 
appeared one sort of conﬁ  guration of community and kingdom, while in the


In [35]:
# Run the query through the retriever and report the length of the first result's text.
retrieved_docs = retriever.invoke("rajasthan")
len(retrieved_docs[0].page_content)

512

In [36]:
# Full text of the first document returned by the retriever.
print(retrieved_docs[0].page_content)

Rajasthan in the northwest or Karnataka in the central peninsula  –  there 
appeared one sort of conﬁ  guration of community and kingdom, while in the 
Gangetic plain and the Coromandel plain in the south there was another. In Rajasthan and Karnataka, the caste culture of the medieval age was known, but the hierarchical practices of caste relations were attenuated by the prin-ciples of clan organization of the farming communities and their artisan and priestly clients that now characterized the countryside.
