In [1]:
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
import re
from sentence_transformers import SentenceTransformer
import numpy as np
from qdrant_client import models, QdrantClient
# from langchain.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Disabled alternative: load a single chapter PDF directly instead of the whole directory.
# loader = UnstructuredPDFLoader('./BG/Chapter1_War_Within_Verses.pdf', strategy='fast') 
# docs = loader.load()
# docs

In [5]:
# Load every PDF found (recursively) under ../BG/.
loader = DirectoryLoader("../BG/", glob="**/*.pdf", show_progress=True, use_multithreading=True)


def _natural_source_key(doc):
    """Sort key on the document's source path that compares digit runs numerically.

    This makes e.g. 'Chapter2' sort before 'Chapter10'.
    """
    parts = re.split(r"(\d+)", doc.metadata["source"])
    # re.split with a capture group alternates text / digits, so odd positions are the numbers.
    return [int(part) if idx % 2 else part for idx, part in enumerate(parts)]


# With multithreading the loader gives no guarantee about the order of the returned documents,
# and later cells derive the chapter index from list position, so fix the order explicitly.
docs = sorted(loader.load(), key=_natural_source_key)
docs

100%|██████████| 3/3 [00:27<00:00,  9.09s/it]


[Document(page_content='Chapter One The War Within\n\nSri Krishna consoles and instructs Prince Arjuna as he is about to go into battle against family and friends to defend his older brother’s claim to the ancient throne of the Kurus. Thus the great scripture called Bhagavad Gita, the “Song of the Lord,” begins. Sri Krishna is Bhagavan, “the Lord,” the mysterious incarnation of Lord Vishnu, the aspect of God who fosters and preserves the universe against the forces constantly working to destroy and corrupt it. Krishna has appeared on earth as a royal prince of the house of the Yadavas; thus he combines earthly majesty with a hidden spiritual power. Most know him only as an unimportant prince, but the wise have seen him reveal his power to destroy evil and protect the good.\n\nThe battle of the Bhagavad Gita is not Krishna’s fight, however; it is Arjuna’s. Krishna is only Arjuna’s charioteer and advisor. He has promised Arjuna that he will be with him throughout the ordeal, but much as 

In [6]:
# Keep only the documents whose source path contains 'Verses' (verse text, not commentary).
verse_dos = list(filter(lambda doc: 'Verses' in doc.metadata['source'], docs))

#### Process documents and add metadata

In [7]:
def get_verses_from_text(doc):
    """Return the distinct 1-2 digit numbers found in ``doc.page_content``, as strings.

    The numbers are de-duplicated and sorted in ascending numeric order. If more than
    one number is found and any two neighbouring values are more than 5 apart, the
    smallest value is dropped (presumably a chapter number rather than a verse number).

    Args:
        doc: any object exposing a ``page_content`` string attribute.

    Returns:
        A list of number strings; empty when the text contains no digits.
    """
    matches = re.findall(r"\d{1,2}", doc.page_content)
    numbers = np.unique(np.asarray(matches, dtype='int'))

    # Verses inside one chunk should be close to one another; a large jump means an outlier.
    gaps = np.abs(np.diff(numbers))
    has_outlier = numbers.size > 1 and bool((gaps > 5).any())
    if has_outlier:
        numbers = numbers[1:]  #TODO find a more generic way to do this, right now works for 2 chapters

    return list(numbers.astype('str'))

In [None]:
# Display the filtered verse documents for inspection.
verse_dos

[Document(page_content='1: The War Within\n\nDHRITARASHTRA\n\n1 O Sanjaya, tell me what happened at Kurukshetra, the field of dharma, where my family and the Pandavas gathered to fight.\n\nSANJAYA\n\n2 Having surveyed the forces of the Pandavas arrayed for battle, prince Duryodhana approached his teacher, Drona, and spoke. 3 “O my teacher, look at this mighty army of the Pandavas, assembled by your own gifted disciple, Yudhishthira. 4 There are heroic warriors and great archers who are the equals of Bhima and Arjuna: Yuyudhana, Virata, the mighty Drupada, 5 Dhrishtaketu, Chekitana, the valiant king of Kashi, Purujit, Kuntibhoja,the great leader Shaibya, 6 the powerful Yudhamanyu, the valiant Uttamaujas, and the son of Subhadra, in addition to the sons of Draupadi. All these command mighty chariots.\n\n7 “O best of brahmins, listen to the names of those who are distinguished among our own forces: 8 Bhishma, Karna, and the victorious Kripa; Ashvatthama, Vikarna, and the son of Somadatta.

In [8]:
# Client for the Qdrant instance listening on localhost:6333.
qdrant_client = QdrantClient("http://localhost:6333")
# Sentence-embedding model used for both the stored chunks and the search queries.
em_model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
# Split each verse document on digits (regex separator, kept inside the chunk) and tag
# every resulting chunk with the numbers found in it and the index of its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=40,
    length_function=len,
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    separators=[r"\d"],
    is_separator_regex=True,
    keep_separator=True,
)

all_chapters_docs = []  # list of tagged docs across all chapters

for i, verse_doc in enumerate(verse_dos):
    chapters_docs = text_splitter.create_documents([verse_doc.page_content])
    for chunk in chapters_docs:
        # TODO get author
        # NOTE(review): 'chapter' is the 0-based position in verse_dos, not a chapter number
        # parsed from the file name - confirm this is the intended labelling.
        chunk.metadata = {'verses': get_verses_from_text(chunk), 'chapter': i}
    all_chapters_docs.extend(chapters_docs)
    print(f"for Chapter {i} {len(chapters_docs)} verses added")

for Chapter 0 40 verses added
for Chapter 1 71 verses added


In [10]:
# Total number of tagged chunks collected across all chapters.
len(all_chapters_docs)

111

In [11]:
# Build one plain-dict payload per chunk: the raw text, a cleaned copy with every digit
# and every blank-line separator ("\n\n") removed, and the chunk's verse/chapter metadata.
finalDocuments = [
    {
        "original_text": chunk.page_content,
        "cleaned_text": re.sub(r'\d|(\n\n)', '', chunk.page_content),
        "verses": chunk.metadata["verses"],
        "chapter": chunk.metadata["chapter"],
    }
    for chunk in all_chapters_docs
]
len(finalDocuments)

111

#### Load and save embeddings to Qdrant

In [None]:
# Vector settings for the collection: the dimension is taken from the embedding model,
# and similarity is measured with cosine distance.
vector_params = models.VectorParams(
    size=em_model.get_sentence_embedding_dimension(),
    distance=models.Distance.COSINE,
)
# Recreate the "BG1and2" collection with those settings.
qdrant_client.recreate_collection(collection_name="BG1and2", vectors_config=vector_params)

True

In [None]:
# Embed all cleaned chunks in a single batched call instead of one encode() call per
# chunk; encode() on a list returns one embedding row per input text, in input order.
cleaned_texts = [doc["cleaned_text"] for doc in finalDocuments]
embeddings = em_model.encode(cleaned_texts)

# Upload one record per chunk: id is the chunk's position, payload is the full chunk dict.
qdrant_client.upload_records(
    collection_name="BG1and2",
    records=[
        models.Record(id=idx, vector=embedding.tolist(), payload=doc)
        for idx, (doc, embedding) in enumerate(zip(finalDocuments, embeddings))
    ],
)

In [22]:
# Query text to embed and search for; alternative example queries are kept commented out.
# query = "war with family members"
# query= "immortality"
# query = "what is said about reincarnation"
query = "What is the self?"

In [23]:
# Embed the query with the same model that produced the stored vectors.
query_vector = em_model.encode(query).tolist()

# Fetch the 10 nearest chunks from the "BG1and2" collection.
hits = qdrant_client.search(
    collection_name="BG1and2",
    # Optional: restrict the search to one chapter by enabling this filter.
    # query_filter=models.Filter(
    #     must=[
    #         models.FieldCondition(
    #             key="chapter",
    #             match=models.MatchValue(
    #                 value=1,
    #             ),
    #         )
    #     ]
    # ),
    query_vector=query_vector,
    limit=10,
)

# One line per hit: cleaned text, chapter index, verse numbers and similarity score.
for hit in hits:
    print(f'{hit.payload["cleaned_text"]} >> Chapter: {hit.payload["chapter"]} Verse : {hit.payload["verses"]}, Score : {hit.score}')

 The glory of the Self is beheld by a few, and a few describe it; a few listen, but many without understanding.  >> Chapter: 1 Verse : ['29'], Score : 0.62122476
 The Self cannot be pierced or burned, made wet or dry. It is everlasting and infinite, standing on the motionless foundations of eternity.  >> Chapter: 1 Verse : ['24'], Score : 0.52031195
 As one abandons worn-out clothes and acquires new ones, so when the body is worn out a new one is acquired by the Self, who lives within. >> Chapter: 1 Verse : ['22'], Score : 0.49016458
 The Self is unmanifested, beyond all thought, beyond all change. Knowing this, you should not grieve. >> Chapter: 1 Verse : ['25'], Score : 0.4427987
 Those who follow this path, resolving deep within themselves to seek me alone, attain singleness of purpose. For those who lack resolution, the decisions of life are many-branched and endless. >> Chapter: 1 Verse : ['41'], Score : 0.42654747
 The Self cannot be pierced by weapons or burned by fire; water ca