In [1]:
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="llama3",
    temperature=0,
)

In [2]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [4]:
from langchain_community.document_loaders import PDFMinerLoader

loader = PDFMinerLoader("/home/sneha/langchain-rag/IndianConstitution_Eng.pdf")
docs = loader.load()

In [None]:
# import requests
# from bs4 import BeautifulSoup

# # URL of the Bhagavad Gita site
# url = "https://bhagavadgita.io/"

# # Send a GET request to fetch the page content
# response = requests.get(url)

# # Parse the page content with BeautifulSoup
# soup = BeautifulSoup(response.content, "html.parser")

# # Find all h2 elements with the class 'font-bold text-my-orange'
# chapter_names = soup.find_all('h2', class_='font-bold text-my-orange')


# # Find all h3 elements with the class 'text-xl font-bold dark:text-white'
# chapter_titles = soup.find_all('h3', class_='text-xl font-bold dark:text-white')

# # Find all span elements with the class 'mb-0.5'
# verse_info = soup.find_all('span', class_='mb-0.5')

# chapter_names = [chapter_name.get_text(strip=True) for chapter_name in chapter_names]
# chapter_titles = [chapter_title.get_text(strip=True) for chapter_title in chapter_titles]
# verse_info = [int(verse.get_text(strip=True)[:2]) for verse in verse_info]

# import pandas as pd
# pd.DataFrame({
#     "Chapter Name": chapter_names,
#     "Chapter Title": chapter_titles,
#     "Verse Info": verse_info
# }).to_csv("bhagavad_gita.csv", index=False)

In [67]:
import pandas as pd
df = pd.read_csv("bhagavad_gita.csv")

In [68]:
df

Unnamed: 0,Chapter Name,Chapter Title,Verse Info
0,Chapter1,Arjuna Visada Yoga,47
1,Chapter2,Sankhya Yoga,72
2,Chapter3,Karma Yoga,43
3,Chapter4,Jnana Karma Sanyasa Yoga,42
4,Chapter5,Karma Sanyasa Yoga,29
5,Chapter6,Dhyana Yoga,47
6,Chapter7,Gyaan Vigyana Yoga,30
7,Chapter8,Akshara Brahma Yoga,28
8,Chapter9,Raja Vidya Yoga,34
9,Chapter10,Vibhooti Yoga,42


In [69]:
df["urls"] = df.apply(lambda row: [f"https://bhagavadgita.io/chapter/{row.name+1}/verse/{i}/" for i in range(1, row["Verse Info"]+1)], axis=1)


In [70]:
df = df.explode("urls").reset_index(drop=True)

In [78]:
df["Verse Info"]=df["urls"].apply(lambda x: x.split("/")[-2])

In [84]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
import pandas as pd
from tqdm import tqdm  # Corrected the typo here

# Assuming the DataFrame is named df and already has a column "urls"
# List to store the content of all URLs
all_docs = []

# Use tqdm to iterate through each URL in the 'urls' column with a progress bar
for index, row in tqdm(df.iterrows(), total=len(df), desc="Loading URLs"):
    url = row['urls']  # Get the URL from the "urls" column
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs={"parse_only": bs4.SoupStrainer(["h1", "p"])},  # Specify the tags to parse
    )
    docs = loader.load()
    
    if docs:
        # Extract the content from the loaded document
        all_docs.append(docs[0].page_content)

# Output total number of characters from all documents
total_characters = sum(len(doc) for doc in all_docs)
print(f"Total characters across all documents: {total_characters}")


Loading URLs:  13%|█▎        | 93/701 [02:19<15:11,  1.50s/it]


KeyboardInterrupt: 

In [85]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to load and process each URL
def process_url(url):
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs={"parse_only": bs4.SoupStrainer(["h1", "p"])}
    )
    docs = loader.load()
    if docs:
        return docs[0].page_content
    return None

# List to store the content of all URLs
all_docs = []

# Create a ThreadPoolExecutor to parallelize the URL fetching
with ThreadPoolExecutor(max_workers=10) as executor:  # Adjust max_workers based on your system capacity
    # Generate futures for each URL in the 'urls' column
    futures = {executor.submit(process_url, row['urls']): index for index, row in df.iterrows()}
    
    # Track progress
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing URLs"):
        result = future.result()
        if result:
            all_docs.append(result)

# Output total number of characters from all documents
total_characters = sum(len(doc) for doc in all_docs)
print(f"Total characters across all documents: {total_characters}")


Processing URLs: 100%|██████████| 701/701 [01:37<00:00,  7.22it/s]

Total characters across all documents: 1249020





In [88]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to load and process each URL
def process_url(row):
    chapter_number = row['Chapter Name'] # Index + 1 for chapter number
    chapter_name = row['Chapter Title']
    verse_info = row['Verse Info']
    url = row['urls']
    
    # Load the page content using WebBaseLoader
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs={"parse_only": bs4.SoupStrainer(["h1", "p"])}  # Specify the tags to parse
    )
    docs = loader.load()
    
    if docs:
        # Extract the content and return it along with metadata
        page_content = docs[0].page_content
        return {
            'chapter_number': chapter_number,
            'chapter_name': chapter_name,
            'verse_info': verse_info,
            'page_content': page_content
        }
    return None

# List to store the content of all URLs along with metadata
all_docs_with_metadata = []

# Create a ThreadPoolExecutor to parallelize the URL fetching
with ThreadPoolExecutor(max_workers=10) as executor:  # Adjust max_workers based on your system capacity
    # Generate futures for each URL in the 'urls' column
    futures = {executor.submit(process_url, row): index for index, row in df.iterrows()}
    
    # Track progress
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing URLs"):
        result = future.result()
        if result:
            all_docs_with_metadata.append(result)

# # Convert the list of metadata and content to a DataFrame
# metadata_df = pd.DataFrame(all_docs_with_metadata)

# # Output the first few rows of the new DataFrame
# print(metadata_df.head())

# # Optionally, you can save the metadata and content to a CSV or JSON file
# metadata_df.to_csv("chapter_verse_metadata.csv", index=False)  # Save to CSV
# # metadata_df.to_json("chapter_verse_metadata.json", orient="records", lines=True)  # Save to JSON


Processing URLs: 100%|██████████| 701/701 [01:35<00:00,  7.36it/s]


In [100]:
len(all_docs_with_metadata)

701

In [101]:
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm

# Use Ollama embedding function (requires Ollama to be running)
ollama_embedding = embedding_functions.OllamaEmbeddingFunction(model_name="nomic-embed-text")

# Create or connect to Chroma DB collection
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(
    name="bhagavadgita",
    embedding_function=ollama_embedding
)

# Add documents with metadata and track progress
for i, doc in tqdm(enumerate(all_docs_with_metadata), total=len(all_docs_with_metadata), desc="Embedding & Storing"):
    content = doc["page_content"]
    metadata = {
        "chapter_number": str(doc["chapter_number"]),
        "chapter_name": doc["chapter_name"],
        "verse_info": str(doc["verse_info"]),
    }

    # Add to ChromaDB (must have unique IDs)
    collection.add(
        documents=[content],
        metadatas=[metadata],
        ids=[f"verse-{i}-{doc["chapter_number"]}"]
    )


Embedding & Storing:  30%|███       | 212/701 [01:28<03:24,  2.39it/s]


KeyboardInterrupt: 

In [102]:
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
import signal
import os

# Init chroma collection
embedding_fn = embedding_functions.OllamaEmbeddingFunction(model_name="llama3")

client = chromadb.Client()
collection = client.get_or_create_collection(name="bhagavadgita", embedding_function=embedding_fn)

# Ensure this is global so each process can access
global_all_docs = None  # We'll assign this below


# To gracefully handle multiprocessing exit
def init_worker():
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def embed_and_store(i):
    doc = global_all_docs[i]
    content = doc["page_content"]
    metadata = {
        "chapter_number": str(doc["chapter_number"]),
        "chapter_name": doc["chapter_name"],
        "verse_info": str(doc["verse_info"]),
    }
    doc_id = f"verse-{i}-{doc['chapter_number']}"

    # Embed and store
    collection.add(
        documents=[content],
        metadatas=[metadata],
        ids=[doc_id],
    )
    return doc_id


# === Entry point ===
def run_parallel_embedding(all_docs):
    global global_all_docs
    global_all_docs = all_docs  # assign to global variable for multiprocessing

    try:
        with Pool(processes=cpu_count(), initializer=init_worker) as pool:
            list(tqdm(pool.imap_unordered(embed_and_store, range(len(all_docs))), total=len(all_docs), desc="Embedding & Storing"))
    except KeyboardInterrupt:
        print("Multiprocessing interrupted. Some documents may be missing.")


# --- Call this with your data ---
run_parallel_embedding(all_docs_with_metadata)


Embedding & Storing:   0%|          | 0/701 [00:29<?, ?it/s]

Multiprocessing interrupted. Some documents may be missing.





In [None]:
docs[0].page_content[:1000]


'BG 1.1धृतराष्ट्र उवाच\n\nधर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।\n\nमामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय।।1.1।।\n dhṛitarāśhtra uvācha\ndharma-kṣhetre kuru-kṣhetre samavetā yuyutsavaḥ\nmāmakāḥ pāṇḍavāśhchaiva kimakurvata sañjaya\ndhṛitarāśhtraḥ uvācha—Dhritarashtra said; dharma-kṣhetre—the land of dharma; kuru-kṣhetre—at Kurukshetra; samavetāḥ—having gathered; yuyutsavaḥ—desiring to fight; māmakāḥ—my sons; pāṇḍavāḥ—the sons of Pandu; cha—and; eva—certainly; kim—what; akurvata—did they do; sañjaya—Sanjay\nDhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"1.1 धर्मक्षेत्रे on the holy plain? कुरुक्षेत्रे in Kurukshetra? समवेताः assembled together? युयुत्सवः desirous to fight? मामकाः my people? पाण्डवाः the sons of Pandu? च and? एव also? किम् what? अकुर्वत did do? सञ्जय O Sanjaya.Commentary Dharmakshetra -- that place which protects Dharma is Dharmakshetra. Because it was in 

In [75]:

assert len(docs) == 1
print(f"Total characters: {len(docs[0].page_content)}")

Total characters: 1169


In [76]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 3 sub-documents.


In [77]:
from tqdm import tqdm
from langchain_community.vectorstores import Chroma

# Create or load your Chroma vector store
vector_store = Chroma(embedding_function=embeddings, persist_directory="chroma_db")


# Add documents in batches with a progress bar
for i in tqdm(range(0, len(all_splits)), desc="Indexing chunks"):
    chunk = [all_splits[i]]
    vector_store.add_documents(chunk)

# Save the vector store to disk
vector_store.persist()


NameError: name 'embeddings' is not defined

In [None]:
from langchain import hub

prompt = hub.pull("rlm/rag-prompt")

example_messages = prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()

assert len(example_messages) == 1
print(example_messages[0].content)



You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: (question goes here) 
Context: (context goes here) 
Answer:


In [None]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

In [None]:
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

In [None]:
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
result = graph.invoke({"question": "What are some law related to women?"})

print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')

Context: [Document(metadata={'producer': 'Acrobat Distiller 5.0 (Windows)', 'start_index': 46605, 'creationdate': '2005-11-11T12:20:49+05:30', 'total_pages': 291, 'source': '/home/sneha/langchain-rag/IndianConstitution_Eng.pdf', 'moddate': '2005-11-16T12:43:55+05:30', 'title': 'PREFACE', 'creator': 'Acrobat PDFMaker 5.0 for Word', 'author': 'New user'}, page_content='(3) In this article, unless the context otherwise requires,— \n\n(a)  “law” \n\nincludes  any  Ordinance,  order,  bye-law,  rule,  regulation, \n\nnotification, custom or usage having in the territory of India the force of law; \n\n(b)  “laws  in  force”  includes  laws  passed  or  made  by  a  Legislature  or  other \ncompetent  authority  in  the  territory  of  India  before  the  commencement  of  this \nConstitution and not previously repealed, notwithstanding that any such law or \nany part thereof may not be then in operation either at all or in particular areas. \n(4) Nothing in this article shall apply to any am

In [None]:
result = graph.invoke({"question": "What are some law related to men protection from women. Are indian law weak for men?"})

print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')

Context: [Document(metadata={'source': '/home/sneha/langchain-rag/IndianConstitution_Eng.pdf', 'moddate': '2005-11-16T12:43:55+05:30', 'total_pages': 291, 'producer': 'Acrobat Distiller 5.0 (Windows)', 'start_index': 46605, 'author': 'New user', 'creationdate': '2005-11-11T12:20:49+05:30', 'title': 'PREFACE', 'creator': 'Acrobat PDFMaker 5.0 for Word'}, page_content='(3) In this article, unless the context otherwise requires,— \n\n(a)  “law” \n\nincludes  any  Ordinance,  order,  bye-law,  rule,  regulation, \n\nnotification, custom or usage having in the territory of India the force of law; \n\n(b)  “laws  in  force”  includes  laws  passed  or  made  by  a  Legislature  or  other \ncompetent  authority  in  the  territory  of  India  before  the  commencement  of  this \nConstitution and not previously repealed, notwithstanding that any such law or \nany part thereof may not be then in operation either at all or in particular areas. \n(4) Nothing in this article shall apply to any am

In [None]:
result = graph.invoke({"question": "What are some law related to men protection from women. Are indian law weak for men?"})

print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')

Context: [Document(metadata={'source': '/home/sneha/langchain-rag/IndianConstitution_Eng.pdf', 'moddate': '2005-11-16T12:43:55+05:30', 'total_pages': 291, 'producer': 'Acrobat Distiller 5.0 (Windows)', 'start_index': 46605, 'author': 'New user', 'creationdate': '2005-11-11T12:20:49+05:30', 'title': 'PREFACE', 'creator': 'Acrobat PDFMaker 5.0 for Word'}, page_content='(3) In this article, unless the context otherwise requires,— \n\n(a)  “law” \n\nincludes  any  Ordinance,  order,  bye-law,  rule,  regulation, \n\nnotification, custom or usage having in the territory of India the force of law; \n\n(b)  “laws  in  force”  includes  laws  passed  or  made  by  a  Legislature  or  other \ncompetent  authority  in  the  territory  of  India  before  the  commencement  of  this \nConstitution and not previously repealed, notwithstanding that any such law or \nany part thereof may not be then in operation either at all or in particular areas. \n(4) Nothing in this article shall apply to any am